fix: MoE intermediate_size not scaled for tiny — 158M→4M MoE params
Bug: setdefault('moe_intermediate_size', ...) doesn't override the
1728 value from the full-scale config.json. Result: each expert is
SwiGLUMLP(256, 1728) = 1.3M params × 17 experts × 7 layers = 158M
params in MoE alone (70% of model), when it should be ~4M.
Fix: when a scale preset is active, force-override moe_intermediate_size
to intermediate_size // 4 (= 128 for tiny). Also force n_routed_experts
to 8 (not 16 from full config).
Param reduction: 227M → ~73M total. Step time ~3× faster.
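
For reference, the pitfall is plain dict semantics: setdefault() only writes a key that is absent, so the 1728 inherited from the full-scale config.json survives, whereas direct assignment overrides it. The sketch below is illustrative only (a bare dict, not the real config object); the per-expert figures come from the three SwiGLU projections (gate, up, down) at hidden=256.

```python
# Illustrative sketch of the bug, not the actual chimera code.
moe_cfg = {"moe_intermediate_size": 1728}          # value leaked in from full-scale config.json

moe_cfg.setdefault("moe_intermediate_size", 128)   # no-op: the key already exists
assert moe_cfg["moe_intermediate_size"] == 1728    # bug: tiny scale keeps the full-scale width

moe_cfg["moe_intermediate_size"] = 128             # direct assignment is what actually overrides
assert moe_cfg["moe_intermediate_size"] == 128

# Per-expert SwiGLU params = 3 projections (gate, up, down), each hidden x intermediate:
hidden = 256
print(3 * hidden * 1728)            # 1,327,104  (~1.3M per expert before the fix)
print(3 * hidden * 1728 * 17 * 7)   # ~158M across 17 experts x 7 MoE layers
print(3 * hidden * 128)             # 98,304     (~0.1M per expert after the fix)
```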
- chimera/training/common.py +14 -2
chimera/training/common.py

@@ -64,12 +64,24 @@ def apply_standard_config_tweaks(config: dict[str, Any], *, scale: str, seq_len:
         "persistent_memory_slots": 16,
         "local_window_size": min(seq_len, 256),
     })
+
+    # ── MoE config ──
+    # IMPORTANT: force-override MoE sizes when a scale preset is active.
+    # The full-scale config.json has moe_intermediate_size=1728 (for hidden=2560).
+    # Using setdefault() would silently keep 1728 even at tiny scale (hidden=256),
+    # resulting in 158M params in MoE alone (70% of model).
     moe_cfg = config.setdefault("backbone", {}).setdefault("moe", {})
     moe_cfg.setdefault("layers", [3, 7, 11, 15, 19, 23, 27])
-    moe_cfg.setdefault("moe_intermediate_size", config["intermediate_size"] // 4)
-    moe_cfg.setdefault("n_routed_experts", 8)
+    if scale in DEFAULT_SCALE_PRESETS:
+        # Force scale-appropriate MoE sizes
+        moe_cfg["moe_intermediate_size"] = config["intermediate_size"] // 4
+        moe_cfg["n_routed_experts"] = 8
+    else:
+        moe_cfg.setdefault("moe_intermediate_size", config["intermediate_size"] // 4)
+        moe_cfg.setdefault("n_routed_experts", 8)
     moe_cfg.setdefault("n_shared_experts", 1)
     moe_cfg.setdefault("num_experts_per_tok", 2)
+
     config.setdefault("looping", {}).update({
         "enabled": True,
         "prelude": [0, 3],
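
A small regression test could pin down the new behavior. The sketch below is hypothetical and makes several assumptions not confirmed by the diff: that "tiny" is a key of DEFAULT_SCALE_PRESETS, that the tiny preset uses intermediate_size=512 (so 512 // 4 == 128), that seq_len is the only other required argument, and that the function tolerates a minimal config dict.

```python
# Hypothetical regression test; adjust keys/values to the real tiny preset.
from chimera.training.common import apply_standard_config_tweaks

def test_tiny_scale_forces_moe_sizes():
    config = {
        "intermediate_size": 512,  # assumed tiny-scale value (512 // 4 == 128)
        "backbone": {"moe": {
            # stale full-scale values that used to survive setdefault()
            "moe_intermediate_size": 1728,
            "n_routed_experts": 16,
        }},
    }
    apply_standard_config_tweaks(config, scale="tiny", seq_len=512)

    moe_cfg = config["backbone"]["moe"]
    assert moe_cfg["moe_intermediate_size"] == 128  # forced override, not setdefault()
    assert moe_cfg["n_routed_experts"] == 8
```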