| { | |
| "0": { | |
| "layer": 0, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0848, | |
| "q25_var64": 0.0844, | |
| "q75_var64": 0.0852, | |
| "mean_s0": 2.88, | |
| "std_s0": 0.21, | |
| "mean_s0_ratio": 1.02, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1268, | |
| "s0": 3.53, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.5374, | |
| "s0": 7.86, | |
| "s0_s1": 1.15, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "1": { | |
| "layer": 1, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0866, | |
| "q25_var64": 0.0865, | |
| "q75_var64": 0.0868, | |
| "mean_s0": 2.84, | |
| "std_s0": 0.09, | |
| "mean_s0_ratio": 1.01, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1906, | |
| "s0": 3.8, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4951, | |
| "s0": 4.92, | |
| "s0_s1": 1.44, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "2": { | |
| "layer": 2, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0865, | |
| "q25_var64": 0.0863, | |
| "q75_var64": 0.0867, | |
| "mean_s0": 2.87, | |
| "std_s0": 0.35, | |
| "mean_s0_ratio": 1.02, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2442, | |
| "s0": 3.8, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4392, | |
| "s0": 3.65, | |
| "s0_s1": 1.07, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "3": { | |
| "layer": 3, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0998, | |
| "q25_var64": 0.095, | |
| "q75_var64": 0.1042, | |
| "mean_s0": 3.72, | |
| "std_s0": 0.47, | |
| "mean_s0_ratio": 1.11, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.0948, | |
| "s0": 4.65, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4697, | |
| "s0": 6.69, | |
| "s0_s1": 1.09, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "4": { | |
| "layer": 4, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1017, | |
| "q25_var64": 0.097, | |
| "q75_var64": 0.1068, | |
| "mean_s0": 3.9, | |
| "std_s0": 0.53, | |
| "mean_s0_ratio": 1.12, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1239, | |
| "s0": 4.34, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4817, | |
| "s0": 6.34, | |
| "s0_s1": 1.15, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "5": { | |
| "layer": 5, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1062, | |
| "q25_var64": 0.0981, | |
| "q75_var64": 0.1157, | |
| "mean_s0": 4.2, | |
| "std_s0": 0.65, | |
| "mean_s0_ratio": 1.16, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1554, | |
| "s0": 3.83, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4402, | |
| "s0": 6.17, | |
| "s0_s1": 1.23, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "6": { | |
| "layer": 6, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1017, | |
| "q25_var64": 0.0963, | |
| "q75_var64": 0.1079, | |
| "mean_s0": 4.11, | |
| "std_s0": 0.62, | |
| "mean_s0_ratio": 1.16, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1669, | |
| "s0": 4.28, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.437, | |
| "s0": 5.39, | |
| "s0_s1": 1.12, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "7": { | |
| "layer": 7, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1055, | |
| "q25_var64": 0.0982, | |
| "q75_var64": 0.1137, | |
| "mean_s0": 4.26, | |
| "std_s0": 0.56, | |
| "mean_s0_ratio": 1.19, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1686, | |
| "s0": 4.15, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4548, | |
| "s0": 5.14, | |
| "s0_s1": 1.07, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "8": { | |
| "layer": 8, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.104, | |
| "q25_var64": 0.0992, | |
| "q75_var64": 0.1135, | |
| "mean_s0": 4.28, | |
| "std_s0": 0.68, | |
| "mean_s0_ratio": 1.19, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.194, | |
| "s0": 4.24, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4371, | |
| "s0": 5.56, | |
| "s0_s1": 1.14, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "9": { | |
| "layer": 9, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1122, | |
| "q25_var64": 0.105, | |
| "q75_var64": 0.1233, | |
| "mean_s0": 4.62, | |
| "std_s0": 0.8, | |
| "mean_s0_ratio": 1.2, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2007, | |
| "s0": 4.39, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4314, | |
| "s0": 6.46, | |
| "s0_s1": 1.24, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "10": { | |
| "layer": 10, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1172, | |
| "q25_var64": 0.1082, | |
| "q75_var64": 0.1314, | |
| "mean_s0": 4.78, | |
| "std_s0": 0.81, | |
| "mean_s0_ratio": 1.21, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1916, | |
| "s0": 4.74, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4471, | |
| "s0": 6.66, | |
| "s0_s1": 1.3, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "11": { | |
| "layer": 11, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1153, | |
| "q25_var64": 0.1045, | |
| "q75_var64": 0.1313, | |
| "mean_s0": 4.62, | |
| "std_s0": 1.02, | |
| "mean_s0_ratio": 1.19, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1849, | |
| "s0": 4.12, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4657, | |
| "s0": 6.93, | |
| "s0_s1": 1.29, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "12": { | |
| "layer": 12, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1208, | |
| "q25_var64": 0.109, | |
| "q75_var64": 0.1364, | |
| "mean_s0": 4.9, | |
| "std_s0": 0.9, | |
| "mean_s0_ratio": 1.22, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2135, | |
| "s0": 4.62, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4562, | |
| "s0": 6.02, | |
| "s0_s1": 1.18, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "13": { | |
| "layer": 13, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1286, | |
| "q25_var64": 0.1127, | |
| "q75_var64": 0.1449, | |
| "mean_s0": 5.15, | |
| "std_s0": 0.97, | |
| "mean_s0_ratio": 1.26, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2054, | |
| "s0": 4.42, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4671, | |
| "s0": 6.36, | |
| "s0_s1": 1.25, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "14": { | |
| "layer": 14, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1426, | |
| "q25_var64": 0.1234, | |
| "q75_var64": 0.1575, | |
| "mean_s0": 5.46, | |
| "std_s0": 1.03, | |
| "mean_s0_ratio": 1.25, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2041, | |
| "s0": 4.94, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4733, | |
| "s0": 6.23, | |
| "s0_s1": 1.15, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "15": { | |
| "layer": 15, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1413, | |
| "q25_var64": 0.1179, | |
| "q75_var64": 0.1633, | |
| "mean_s0": 5.49, | |
| "std_s0": 1.18, | |
| "mean_s0_ratio": 1.24, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2034, | |
| "s0": 4.99, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4797, | |
| "s0": 6.75, | |
| "s0_s1": 1.22, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "16": { | |
| "layer": 16, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1471, | |
| "q25_var64": 0.1232, | |
| "q75_var64": 0.1748, | |
| "mean_s0": 5.65, | |
| "std_s0": 1.25, | |
| "mean_s0_ratio": 1.2, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1954, | |
| "s0": 5.52, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4886, | |
| "s0": 6.54, | |
| "s0_s1": 1.22, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "17": { | |
| "layer": 17, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1523, | |
| "q25_var64": 0.1307, | |
| "q75_var64": 0.1797, | |
| "mean_s0": 5.99, | |
| "std_s0": 1.35, | |
| "mean_s0_ratio": 1.22, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1902, | |
| "s0": 4.94, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4963, | |
| "s0": 6.21, | |
| "s0_s1": 1.26, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "18": { | |
| "layer": 18, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1487, | |
| "q25_var64": 0.1313, | |
| "q75_var64": 0.17, | |
| "mean_s0": 5.9, | |
| "std_s0": 1.36, | |
| "mean_s0_ratio": 1.19, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1949, | |
| "s0": 6.07, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4994, | |
| "s0": 5.86, | |
| "s0_s1": 1.17, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "19": { | |
| "layer": 19, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1157, | |
| "q25_var64": 0.1058, | |
| "q75_var64": 0.1274, | |
| "mean_s0": 4.61, | |
| "std_s0": 0.89, | |
| "mean_s0_ratio": 1.13, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1279, | |
| "s0": 4.84, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4813, | |
| "s0": 8.17, | |
| "s0_s1": 1.25, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "20": { | |
| "layer": 20, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1415, | |
| "q25_var64": 0.1242, | |
| "q75_var64": 0.1635, | |
| "mean_s0": 5.69, | |
| "std_s0": 1.37, | |
| "mean_s0_ratio": 1.17, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.215, | |
| "s0": 6.22, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.5111, | |
| "s0": 6.1, | |
| "s0_s1": 1.25, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "21": { | |
| "layer": 21, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1292, | |
| "q25_var64": 0.1037, | |
| "q75_var64": 0.1529, | |
| "mean_s0": 5.3, | |
| "std_s0": 1.5, | |
| "mean_s0_ratio": 1.17, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2144, | |
| "s0": 5.84, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4869, | |
| "s0": 6.86, | |
| "s0_s1": 1.1, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "22": { | |
| "layer": 22, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1209, | |
| "q25_var64": 0.0982, | |
| "q75_var64": 0.1496, | |
| "mean_s0": 5.0, | |
| "std_s0": 1.21, | |
| "mean_s0_ratio": 1.15, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2101, | |
| "s0": 4.86, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4821, | |
| "s0": 7.01, | |
| "s0_s1": 1.23, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "23": { | |
| "layer": 23, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0888, | |
| "q25_var64": 0.0873, | |
| "q75_var64": 0.0948, | |
| "mean_s0": 3.42, | |
| "std_s0": 0.76, | |
| "mean_s0_ratio": 1.08, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1911, | |
| "s0": 5.09, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.484, | |
| "s0": 9.02, | |
| "s0_s1": 1.1, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "24": { | |
| "layer": 24, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0916, | |
| "q25_var64": 0.0878, | |
| "q75_var64": 0.1033, | |
| "mean_s0": 3.65, | |
| "std_s0": 0.9, | |
| "mean_s0_ratio": 1.1, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1699, | |
| "s0": 4.09, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4696, | |
| "s0": 7.31, | |
| "s0_s1": 1.13, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "25": { | |
| "layer": 25, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0953, | |
| "q25_var64": 0.0903, | |
| "q75_var64": 0.109, | |
| "mean_s0": 3.96, | |
| "std_s0": 1.16, | |
| "mean_s0_ratio": 1.14, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2279, | |
| "s0": 4.9, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4568, | |
| "s0": 7.36, | |
| "s0_s1": 1.05, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "26": { | |
| "layer": 26, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0954, | |
| "q25_var64": 0.0915, | |
| "q75_var64": 0.1058, | |
| "mean_s0": 3.86, | |
| "std_s0": 0.99, | |
| "mean_s0_ratio": 1.11, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.199, | |
| "s0": 4.62, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4361, | |
| "s0": 6.16, | |
| "s0_s1": 1.18, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "27": { | |
| "layer": 27, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0981, | |
| "q25_var64": 0.0943, | |
| "q75_var64": 0.1058, | |
| "mean_s0": 3.96, | |
| "std_s0": 1.23, | |
| "mean_s0_ratio": 1.13, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.3269, | |
| "s0": 7.17, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.444, | |
| "s0": 7.06, | |
| "s0_s1": 1.13, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "28": { | |
| "layer": 28, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0938, | |
| "q25_var64": 0.09, | |
| "q75_var64": 0.1045, | |
| "mean_s0": 3.95, | |
| "std_s0": 1.19, | |
| "mean_s0_ratio": 1.14, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2398, | |
| "s0": 4.73, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4454, | |
| "s0": 6.37, | |
| "s0_s1": 1.19, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "29": { | |
| "layer": 29, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0937, | |
| "q25_var64": 0.089, | |
| "q75_var64": 0.11, | |
| "mean_s0": 3.95, | |
| "std_s0": 1.2, | |
| "mean_s0_ratio": 1.12, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2463, | |
| "s0": 5.85, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4549, | |
| "s0": 6.69, | |
| "s0_s1": 1.14, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "30": { | |
| "layer": 30, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0968, | |
| "q25_var64": 0.0905, | |
| "q75_var64": 0.1132, | |
| "mean_s0": 4.16, | |
| "std_s0": 1.32, | |
| "mean_s0_ratio": 1.14, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2351, | |
| "s0": 6.71, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4442, | |
| "s0": 6.39, | |
| "s0_s1": 1.17, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "31": { | |
| "layer": 31, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0947, | |
| "q25_var64": 0.0906, | |
| "q75_var64": 0.1092, | |
| "mean_s0": 4.03, | |
| "std_s0": 1.18, | |
| "mean_s0_ratio": 1.14, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2778, | |
| "s0": 7.04, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4478, | |
| "s0": 6.75, | |
| "s0_s1": 1.14, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "32": { | |
| "layer": 32, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.092, | |
| "q25_var64": 0.0892, | |
| "q75_var64": 0.101, | |
| "mean_s0": 3.83, | |
| "std_s0": 1.11, | |
| "mean_s0_ratio": 1.11, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2878, | |
| "s0": 6.94, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4465, | |
| "s0": 5.95, | |
| "s0_s1": 1.14, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "33": { | |
| "layer": 33, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0925, | |
| "q25_var64": 0.0898, | |
| "q75_var64": 0.1035, | |
| "mean_s0": 3.79, | |
| "std_s0": 1.06, | |
| "mean_s0_ratio": 1.11, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2721, | |
| "s0": 5.62, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4406, | |
| "s0": 5.59, | |
| "s0_s1": 1.05, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "34": { | |
| "layer": 34, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0922, | |
| "q25_var64": 0.09, | |
| "q75_var64": 0.1012, | |
| "mean_s0": 3.71, | |
| "std_s0": 1.17, | |
| "mean_s0_ratio": 1.09, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2554, | |
| "s0": 6.2, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4352, | |
| "s0": 5.0, | |
| "s0_s1": 1.08, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "35": { | |
| "layer": 35, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1022, | |
| "q25_var64": 0.0955, | |
| "q75_var64": 0.1166, | |
| "mean_s0": 4.32, | |
| "std_s0": 1.84, | |
| "mean_s0_ratio": 1.13, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2325, | |
| "s0": 6.58, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4464, | |
| "s0": 5.85, | |
| "s0_s1": 1.13, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "36": { | |
| "layer": 36, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0958, | |
| "q25_var64": 0.0918, | |
| "q75_var64": 0.1045, | |
| "mean_s0": 3.78, | |
| "std_s0": 1.0, | |
| "mean_s0_ratio": 1.1, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1882, | |
| "s0": 7.17, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.452, | |
| "s0": 4.96, | |
| "s0_s1": 1.2, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "37": { | |
| "layer": 37, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0949, | |
| "q25_var64": 0.0904, | |
| "q75_var64": 0.1048, | |
| "mean_s0": 4.02, | |
| "std_s0": 1.65, | |
| "mean_s0_ratio": 1.13, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2981, | |
| "s0": 8.46, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4593, | |
| "s0": 4.93, | |
| "s0_s1": 1.09, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "38": { | |
| "layer": 38, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.098, | |
| "q25_var64": 0.0917, | |
| "q75_var64": 0.1083, | |
| "mean_s0": 4.0, | |
| "std_s0": 1.14, | |
| "mean_s0_ratio": 1.1, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2166, | |
| "s0": 8.46, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4608, | |
| "s0": 4.84, | |
| "s0_s1": 1.09, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "39": { | |
| "layer": 39, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1116, | |
| "q25_var64": 0.1027, | |
| "q75_var64": 0.1299, | |
| "mean_s0": 5.53, | |
| "std_s0": 2.34, | |
| "mean_s0_ratio": 1.25, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2764, | |
| "s0": 8.83, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4941, | |
| "s0": 4.11, | |
| "s0_s1": 1.06, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "40": { | |
| "layer": 40, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1027, | |
| "q25_var64": 0.0943, | |
| "q75_var64": 0.1168, | |
| "mean_s0": 5.01, | |
| "std_s0": 1.97, | |
| "mean_s0_ratio": 1.19, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.3241, | |
| "s0": 9.94, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.4982, | |
| "s0": 4.8, | |
| "s0_s1": 1.15, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "41": { | |
| "layer": 41, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0988, | |
| "q25_var64": 0.0921, | |
| "q75_var64": 0.1127, | |
| "mean_s0": 5.16, | |
| "std_s0": 2.69, | |
| "mean_s0_ratio": 1.16, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2952, | |
| "s0": 8.3, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.49, | |
| "s0": 5.52, | |
| "s0_s1": 1.22, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| }, | |
| "42": { | |
| "layer": 42, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0997, | |
| "q25_var64": 0.0938, | |
| "q75_var64": 0.1085, | |
| "mean_s0": 5.85, | |
| "std_s0": 4.75, | |
| "mean_s0_ratio": 1.18, | |
| "n_experts": 256 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2863, | |
| "s0": 16.44, | |
| "shape": [ | |
| 4096, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.5181, | |
| "s0": 6.69, | |
| "s0_s1": 1.35, | |
| "shape": [ | |
| 256, | |
| 4096 | |
| ] | |
| } | |
| } | |
| } |