| { | |
| "0": { | |
| "layer": 0, | |
| "is_moe": false, | |
| "dense": { | |
| "var64": 0.0373, | |
| "s0": 10.7, | |
| "shape": [ | |
| 7168, | |
| 18432 | |
| ] | |
| } | |
| }, | |
| "1": { | |
| "layer": 1, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0824, | |
| "q25_var64": 0.0758, | |
| "q75_var64": 0.0866, | |
| "mean_s0": 4.69, | |
| "std_s0": 0.95, | |
| "mean_s0_ratio": 1.11, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1127, | |
| "s0": 8.05, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.5682, | |
| "s0": 26.89, | |
| "s0_s1": 1.25, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "2": { | |
| "layer": 2, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0883, | |
| "q25_var64": 0.0827, | |
| "q75_var64": 0.0918, | |
| "mean_s0": 5.09, | |
| "std_s0": 0.81, | |
| "mean_s0_ratio": 1.14, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1041, | |
| "s0": 7.36, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.393, | |
| "s0": 14.9, | |
| "s0_s1": 1.0, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "3": { | |
| "layer": 3, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0901, | |
| "q25_var64": 0.0868, | |
| "q75_var64": 0.0934, | |
| "mean_s0": 5.05, | |
| "std_s0": 0.61, | |
| "mean_s0_ratio": 1.12, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.127, | |
| "s0": 7.47, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3403, | |
| "s0": 15.09, | |
| "s0_s1": 1.06, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "4": { | |
| "layer": 4, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0906, | |
| "q25_var64": 0.0869, | |
| "q75_var64": 0.0966, | |
| "mean_s0": 5.3, | |
| "std_s0": 0.68, | |
| "mean_s0_ratio": 1.15, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1212, | |
| "s0": 6.56, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.35, | |
| "s0": 15.06, | |
| "s0_s1": 1.08, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "5": { | |
| "layer": 5, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0888, | |
| "q25_var64": 0.0849, | |
| "q75_var64": 0.0935, | |
| "mean_s0": 5.25, | |
| "std_s0": 0.71, | |
| "mean_s0_ratio": 1.15, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1513, | |
| "s0": 7.33, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3367, | |
| "s0": 15.28, | |
| "s0_s1": 1.21, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "6": { | |
| "layer": 6, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0865, | |
| "q25_var64": 0.083, | |
| "q75_var64": 0.0911, | |
| "mean_s0": 5.0, | |
| "std_s0": 0.67, | |
| "mean_s0_ratio": 1.15, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1625, | |
| "s0": 7.12, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3258, | |
| "s0": 13.32, | |
| "s0_s1": 1.08, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "7": { | |
| "layer": 7, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0885, | |
| "q25_var64": 0.0845, | |
| "q75_var64": 0.0929, | |
| "mean_s0": 5.11, | |
| "std_s0": 0.66, | |
| "mean_s0_ratio": 1.15, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1665, | |
| "s0": 6.61, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3163, | |
| "s0": 12.09, | |
| "s0_s1": 1.05, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "8": { | |
| "layer": 8, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0898, | |
| "q25_var64": 0.085, | |
| "q75_var64": 0.0943, | |
| "mean_s0": 5.17, | |
| "std_s0": 0.66, | |
| "mean_s0_ratio": 1.16, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1639, | |
| "s0": 6.76, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3089, | |
| "s0": 12.21, | |
| "s0_s1": 1.1, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "9": { | |
| "layer": 9, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0912, | |
| "q25_var64": 0.0867, | |
| "q75_var64": 0.0967, | |
| "mean_s0": 5.28, | |
| "std_s0": 0.71, | |
| "mean_s0_ratio": 1.17, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1638, | |
| "s0": 6.9, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3083, | |
| "s0": 12.38, | |
| "s0_s1": 1.1, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "10": { | |
| "layer": 10, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0919, | |
| "q25_var64": 0.0873, | |
| "q75_var64": 0.0985, | |
| "mean_s0": 5.42, | |
| "std_s0": 0.8, | |
| "mean_s0_ratio": 1.18, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1611, | |
| "s0": 6.26, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3106, | |
| "s0": 11.63, | |
| "s0_s1": 1.08, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "11": { | |
| "layer": 11, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0946, | |
| "q25_var64": 0.0885, | |
| "q75_var64": 0.1006, | |
| "mean_s0": 5.56, | |
| "std_s0": 0.89, | |
| "mean_s0_ratio": 1.19, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1712, | |
| "s0": 6.68, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3061, | |
| "s0": 11.2, | |
| "s0_s1": 1.04, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "12": { | |
| "layer": 12, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.098, | |
| "q25_var64": 0.0915, | |
| "q75_var64": 0.1053, | |
| "mean_s0": 5.8, | |
| "std_s0": 0.91, | |
| "mean_s0_ratio": 1.2, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1517, | |
| "s0": 6.36, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3161, | |
| "s0": 10.69, | |
| "s0_s1": 1.04, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "13": { | |
| "layer": 13, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1008, | |
| "q25_var64": 0.0926, | |
| "q75_var64": 0.1085, | |
| "mean_s0": 6.03, | |
| "std_s0": 1.01, | |
| "mean_s0_ratio": 1.23, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1701, | |
| "s0": 7.06, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3245, | |
| "s0": 10.72, | |
| "s0_s1": 1.06, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "14": { | |
| "layer": 14, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1007, | |
| "q25_var64": 0.0912, | |
| "q75_var64": 0.109, | |
| "mean_s0": 6.11, | |
| "std_s0": 1.04, | |
| "mean_s0_ratio": 1.24, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1624, | |
| "s0": 6.23, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3245, | |
| "s0": 10.31, | |
| "s0_s1": 1.04, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "15": { | |
| "layer": 15, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1006, | |
| "q25_var64": 0.0902, | |
| "q75_var64": 0.1102, | |
| "mean_s0": 6.21, | |
| "std_s0": 1.14, | |
| "mean_s0_ratio": 1.24, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1904, | |
| "s0": 7.38, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3269, | |
| "s0": 10.12, | |
| "s0_s1": 1.02, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "16": { | |
| "layer": 16, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0989, | |
| "q25_var64": 0.0897, | |
| "q75_var64": 0.1117, | |
| "mean_s0": 6.12, | |
| "std_s0": 1.13, | |
| "mean_s0_ratio": 1.23, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1764, | |
| "s0": 7.03, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.328, | |
| "s0": 10.11, | |
| "s0_s1": 1.08, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "17": { | |
| "layer": 17, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1036, | |
| "q25_var64": 0.0912, | |
| "q75_var64": 0.1152, | |
| "mean_s0": 6.38, | |
| "std_s0": 1.24, | |
| "mean_s0_ratio": 1.24, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1793, | |
| "s0": 7.02, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3172, | |
| "s0": 9.52, | |
| "s0_s1": 1.07, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "18": { | |
| "layer": 18, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1039, | |
| "q25_var64": 0.0895, | |
| "q75_var64": 0.1176, | |
| "mean_s0": 6.36, | |
| "std_s0": 1.3, | |
| "mean_s0_ratio": 1.23, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2005, | |
| "s0": 7.53, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3238, | |
| "s0": 9.68, | |
| "s0_s1": 1.1, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "19": { | |
| "layer": 19, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1056, | |
| "q25_var64": 0.0925, | |
| "q75_var64": 0.121, | |
| "mean_s0": 6.44, | |
| "std_s0": 1.32, | |
| "mean_s0_ratio": 1.22, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1877, | |
| "s0": 7.54, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.317, | |
| "s0": 9.43, | |
| "s0_s1": 1.12, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "20": { | |
| "layer": 20, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1053, | |
| "q25_var64": 0.0901, | |
| "q75_var64": 0.122, | |
| "mean_s0": 6.56, | |
| "std_s0": 1.42, | |
| "mean_s0_ratio": 1.25, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1874, | |
| "s0": 7.33, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3196, | |
| "s0": 9.25, | |
| "s0_s1": 1.14, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "21": { | |
| "layer": 21, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.107, | |
| "q25_var64": 0.0927, | |
| "q75_var64": 0.124, | |
| "mean_s0": 6.66, | |
| "std_s0": 1.46, | |
| "mean_s0_ratio": 1.24, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1809, | |
| "s0": 8.29, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3178, | |
| "s0": 8.62, | |
| "s0_s1": 1.08, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "22": { | |
| "layer": 22, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1082, | |
| "q25_var64": 0.0912, | |
| "q75_var64": 0.1254, | |
| "mean_s0": 6.84, | |
| "std_s0": 1.6, | |
| "mean_s0_ratio": 1.24, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1963, | |
| "s0": 8.41, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3277, | |
| "s0": 9.11, | |
| "s0_s1": 1.2, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "23": { | |
| "layer": 23, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.105, | |
| "q25_var64": 0.0857, | |
| "q75_var64": 0.1226, | |
| "mean_s0": 6.57, | |
| "std_s0": 1.63, | |
| "mean_s0_ratio": 1.23, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1836, | |
| "s0": 7.33, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3275, | |
| "s0": 9.25, | |
| "s0_s1": 1.25, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "24": { | |
| "layer": 24, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1066, | |
| "q25_var64": 0.0887, | |
| "q75_var64": 0.1278, | |
| "mean_s0": 6.81, | |
| "std_s0": 1.67, | |
| "mean_s0_ratio": 1.23, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.201, | |
| "s0": 8.72, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.331, | |
| "s0": 8.97, | |
| "s0_s1": 1.15, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "25": { | |
| "layer": 25, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1016, | |
| "q25_var64": 0.0864, | |
| "q75_var64": 0.1222, | |
| "mean_s0": 6.68, | |
| "std_s0": 1.76, | |
| "mean_s0_ratio": 1.23, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2033, | |
| "s0": 8.59, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3307, | |
| "s0": 8.75, | |
| "s0_s1": 1.09, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "26": { | |
| "layer": 26, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.1003, | |
| "q25_var64": 0.0852, | |
| "q75_var64": 0.1226, | |
| "mean_s0": 6.65, | |
| "std_s0": 1.71, | |
| "mean_s0_ratio": 1.24, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1798, | |
| "s0": 7.47, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3257, | |
| "s0": 8.48, | |
| "s0_s1": 1.11, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "27": { | |
| "layer": 27, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0973, | |
| "q25_var64": 0.0849, | |
| "q75_var64": 0.1213, | |
| "mean_s0": 6.56, | |
| "std_s0": 1.75, | |
| "mean_s0_ratio": 1.21, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2018, | |
| "s0": 9.15, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3246, | |
| "s0": 8.54, | |
| "s0_s1": 1.09, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "28": { | |
| "layer": 28, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0938, | |
| "q25_var64": 0.084, | |
| "q75_var64": 0.1134, | |
| "mean_s0": 6.38, | |
| "std_s0": 1.69, | |
| "mean_s0_ratio": 1.22, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1913, | |
| "s0": 8.05, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3196, | |
| "s0": 8.99, | |
| "s0_s1": 1.21, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "29": { | |
| "layer": 29, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0923, | |
| "q25_var64": 0.0816, | |
| "q75_var64": 0.1099, | |
| "mean_s0": 6.17, | |
| "std_s0": 1.64, | |
| "mean_s0_ratio": 1.2, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2067, | |
| "s0": 9.68, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3241, | |
| "s0": 8.27, | |
| "s0_s1": 1.11, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "30": { | |
| "layer": 30, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0887, | |
| "q25_var64": 0.0808, | |
| "q75_var64": 0.1053, | |
| "mean_s0": 5.97, | |
| "std_s0": 1.51, | |
| "mean_s0_ratio": 1.19, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.198, | |
| "s0": 8.47, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3189, | |
| "s0": 7.42, | |
| "s0_s1": 1.03, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "31": { | |
| "layer": 31, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0882, | |
| "q25_var64": 0.0805, | |
| "q75_var64": 0.1005, | |
| "mean_s0": 5.87, | |
| "std_s0": 1.46, | |
| "mean_s0_ratio": 1.2, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1928, | |
| "s0": 7.81, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3221, | |
| "s0": 7.39, | |
| "s0_s1": 1.04, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "32": { | |
| "layer": 32, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0854, | |
| "q25_var64": 0.08, | |
| "q75_var64": 0.099, | |
| "mean_s0": 5.76, | |
| "std_s0": 1.42, | |
| "mean_s0_ratio": 1.2, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1734, | |
| "s0": 6.74, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3212, | |
| "s0": 7.2, | |
| "s0_s1": 1.01, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "33": { | |
| "layer": 33, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0863, | |
| "q25_var64": 0.0797, | |
| "q75_var64": 0.0977, | |
| "mean_s0": 5.69, | |
| "std_s0": 1.39, | |
| "mean_s0_ratio": 1.18, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1844, | |
| "s0": 7.76, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3126, | |
| "s0": 6.95, | |
| "s0_s1": 1.02, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "34": { | |
| "layer": 34, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0855, | |
| "q25_var64": 0.0797, | |
| "q75_var64": 0.0937, | |
| "mean_s0": 5.47, | |
| "std_s0": 1.27, | |
| "mean_s0_ratio": 1.16, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1759, | |
| "s0": 7.64, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3174, | |
| "s0": 7.07, | |
| "s0_s1": 1.04, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "35": { | |
| "layer": 35, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0835, | |
| "q25_var64": 0.0798, | |
| "q75_var64": 0.0937, | |
| "mean_s0": 5.36, | |
| "std_s0": 1.24, | |
| "mean_s0_ratio": 1.15, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1869, | |
| "s0": 8.0, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3224, | |
| "s0": 7.25, | |
| "s0_s1": 1.08, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "36": { | |
| "layer": 36, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0838, | |
| "q25_var64": 0.0797, | |
| "q75_var64": 0.0944, | |
| "mean_s0": 5.35, | |
| "std_s0": 1.29, | |
| "mean_s0_ratio": 1.15, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2021, | |
| "s0": 8.27, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3192, | |
| "s0": 7.19, | |
| "s0_s1": 1.08, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "37": { | |
| "layer": 37, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0834, | |
| "q25_var64": 0.0794, | |
| "q75_var64": 0.0939, | |
| "mean_s0": 5.24, | |
| "std_s0": 1.25, | |
| "mean_s0_ratio": 1.13, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1936, | |
| "s0": 8.42, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3227, | |
| "s0": 6.63, | |
| "s0_s1": 1.02, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "38": { | |
| "layer": 38, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.083, | |
| "q25_var64": 0.0789, | |
| "q75_var64": 0.0904, | |
| "mean_s0": 5.13, | |
| "std_s0": 1.17, | |
| "mean_s0_ratio": 1.13, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1899, | |
| "s0": 7.06, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3145, | |
| "s0": 6.21, | |
| "s0_s1": 1.06, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "39": { | |
| "layer": 39, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0823, | |
| "q25_var64": 0.079, | |
| "q75_var64": 0.0911, | |
| "mean_s0": 5.02, | |
| "std_s0": 1.16, | |
| "mean_s0_ratio": 1.11, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1876, | |
| "s0": 6.88, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3198, | |
| "s0": 6.36, | |
| "s0_s1": 1.07, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "40": { | |
| "layer": 40, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0823, | |
| "q25_var64": 0.0796, | |
| "q75_var64": 0.0878, | |
| "mean_s0": 4.89, | |
| "std_s0": 1.02, | |
| "mean_s0_ratio": 1.1, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1709, | |
| "s0": 6.94, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3158, | |
| "s0": 6.25, | |
| "s0_s1": 1.07, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "41": { | |
| "layer": 41, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0829, | |
| "q25_var64": 0.0795, | |
| "q75_var64": 0.0903, | |
| "mean_s0": 4.91, | |
| "std_s0": 1.04, | |
| "mean_s0_ratio": 1.1, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1735, | |
| "s0": 7.57, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3159, | |
| "s0": 6.03, | |
| "s0_s1": 1.1, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "42": { | |
| "layer": 42, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0835, | |
| "q25_var64": 0.0802, | |
| "q75_var64": 0.089, | |
| "mean_s0": 4.93, | |
| "std_s0": 1.05, | |
| "mean_s0_ratio": 1.1, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1951, | |
| "s0": 7.85, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3156, | |
| "s0": 5.87, | |
| "s0_s1": 1.12, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "43": { | |
| "layer": 43, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0824, | |
| "q25_var64": 0.0786, | |
| "q75_var64": 0.0907, | |
| "mean_s0": 4.92, | |
| "std_s0": 1.05, | |
| "mean_s0_ratio": 1.1, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1947, | |
| "s0": 7.69, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3135, | |
| "s0": 5.64, | |
| "s0_s1": 1.09, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "44": { | |
| "layer": 44, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0827, | |
| "q25_var64": 0.0791, | |
| "q75_var64": 0.0905, | |
| "mean_s0": 5.0, | |
| "std_s0": 1.14, | |
| "mean_s0_ratio": 1.11, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2133, | |
| "s0": 8.59, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3076, | |
| "s0": 5.35, | |
| "s0_s1": 1.08, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "45": { | |
| "layer": 45, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0826, | |
| "q25_var64": 0.0792, | |
| "q75_var64": 0.0883, | |
| "mean_s0": 4.85, | |
| "std_s0": 1.01, | |
| "mean_s0_ratio": 1.1, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2006, | |
| "s0": 7.54, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.312, | |
| "s0": 5.27, | |
| "s0_s1": 1.09, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "46": { | |
| "layer": 46, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0833, | |
| "q25_var64": 0.0795, | |
| "q75_var64": 0.0896, | |
| "mean_s0": 4.89, | |
| "std_s0": 1.05, | |
| "mean_s0_ratio": 1.09, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1878, | |
| "s0": 7.28, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3125, | |
| "s0": 5.16, | |
| "s0_s1": 1.1, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "47": { | |
| "layer": 47, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0833, | |
| "q25_var64": 0.0799, | |
| "q75_var64": 0.0893, | |
| "mean_s0": 4.79, | |
| "std_s0": 0.91, | |
| "mean_s0_ratio": 1.09, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1801, | |
| "s0": 7.55, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3063, | |
| "s0": 4.89, | |
| "s0_s1": 1.07, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "48": { | |
| "layer": 48, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0831, | |
| "q25_var64": 0.0806, | |
| "q75_var64": 0.0897, | |
| "mean_s0": 4.78, | |
| "std_s0": 0.94, | |
| "mean_s0_ratio": 1.08, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.181, | |
| "s0": 8.07, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3137, | |
| "s0": 5.08, | |
| "s0_s1": 1.08, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "49": { | |
| "layer": 49, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0851, | |
| "q25_var64": 0.0812, | |
| "q75_var64": 0.0917, | |
| "mean_s0": 4.93, | |
| "std_s0": 1.03, | |
| "mean_s0_ratio": 1.09, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1804, | |
| "s0": 7.38, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3127, | |
| "s0": 4.81, | |
| "s0_s1": 1.07, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "50": { | |
| "layer": 50, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0848, | |
| "q25_var64": 0.0818, | |
| "q75_var64": 0.0917, | |
| "mean_s0": 5.01, | |
| "std_s0": 1.19, | |
| "mean_s0_ratio": 1.09, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1817, | |
| "s0": 7.56, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3103, | |
| "s0": 4.57, | |
| "s0_s1": 1.07, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "51": { | |
| "layer": 51, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.085, | |
| "q25_var64": 0.0818, | |
| "q75_var64": 0.0929, | |
| "mean_s0": 5.01, | |
| "std_s0": 1.18, | |
| "mean_s0_ratio": 1.1, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1766, | |
| "s0": 7.63, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.308, | |
| "s0": 4.49, | |
| "s0_s1": 1.12, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "52": { | |
| "layer": 52, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0876, | |
| "q25_var64": 0.0833, | |
| "q75_var64": 0.0938, | |
| "mean_s0": 5.25, | |
| "std_s0": 1.65, | |
| "mean_s0_ratio": 1.12, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1841, | |
| "s0": 7.68, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3099, | |
| "s0": 4.48, | |
| "s0_s1": 1.11, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "53": { | |
| "layer": 53, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.087, | |
| "q25_var64": 0.0833, | |
| "q75_var64": 0.0947, | |
| "mean_s0": 5.26, | |
| "std_s0": 1.46, | |
| "mean_s0_ratio": 1.11, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1912, | |
| "s0": 8.26, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3049, | |
| "s0": 4.14, | |
| "s0_s1": 1.08, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "54": { | |
| "layer": 54, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0887, | |
| "q25_var64": 0.0843, | |
| "q75_var64": 0.0965, | |
| "mean_s0": 5.51, | |
| "std_s0": 1.41, | |
| "mean_s0_ratio": 1.14, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2034, | |
| "s0": 8.45, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3038, | |
| "s0": 4.05, | |
| "s0_s1": 1.05, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "55": { | |
| "layer": 55, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.086, | |
| "q25_var64": 0.0811, | |
| "q75_var64": 0.0957, | |
| "mean_s0": 5.6, | |
| "std_s0": 1.82, | |
| "mean_s0_ratio": 1.15, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2073, | |
| "s0": 7.98, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3091, | |
| "s0": 4.33, | |
| "s0_s1": 1.15, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "56": { | |
| "layer": 56, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0863, | |
| "q25_var64": 0.0815, | |
| "q75_var64": 0.0965, | |
| "mean_s0": 5.69, | |
| "std_s0": 1.93, | |
| "mean_s0_ratio": 1.16, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2141, | |
| "s0": 8.61, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3167, | |
| "s0": 4.19, | |
| "s0_s1": 1.11, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "57": { | |
| "layer": 57, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.086, | |
| "q25_var64": 0.0797, | |
| "q75_var64": 0.0987, | |
| "mean_s0": 5.79, | |
| "std_s0": 2.19, | |
| "mean_s0_ratio": 1.16, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2212, | |
| "s0": 10.11, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3266, | |
| "s0": 4.64, | |
| "s0_s1": 1.2, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "58": { | |
| "layer": 58, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0853, | |
| "q25_var64": 0.0797, | |
| "q75_var64": 0.0955, | |
| "mean_s0": 5.75, | |
| "std_s0": 2.08, | |
| "mean_s0_ratio": 1.15, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1874, | |
| "s0": 7.05, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3379, | |
| "s0": 4.87, | |
| "s0_s1": 1.13, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "59": { | |
| "layer": 59, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0846, | |
| "q25_var64": 0.0797, | |
| "q75_var64": 0.0928, | |
| "mean_s0": 5.58, | |
| "std_s0": 1.81, | |
| "mean_s0_ratio": 1.13, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.1663, | |
| "s0": 5.87, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3446, | |
| "s0": 5.4, | |
| "s0_s1": 1.12, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| }, | |
| "60": { | |
| "layer": 60, | |
| "is_moe": true, | |
| "routed_experts": { | |
| "median_var64": 0.0863, | |
| "q25_var64": 0.0811, | |
| "q75_var64": 0.0944, | |
| "mean_s0": 5.52, | |
| "std_s0": 1.19, | |
| "mean_s0_ratio": 1.1, | |
| "n_experts": 384 | |
| }, | |
| "shared_expert": { | |
| "var64": 0.2663, | |
| "s0": 6.47, | |
| "shape": [ | |
| 7168, | |
| 2048 | |
| ] | |
| }, | |
| "router": { | |
| "var64": 0.3667, | |
| "s0": 5.54, | |
| "s0_s1": 1.06, | |
| "shape": [ | |
| 384, | |
| 7168 | |
| ] | |
| } | |
| } | |
| } |