Commit 6cb7b4d (verified) · committed by Lgr54HFi · Parent(s): 5b5a08d

fix: MoE intermediate_size not scaled for tiny — 158M→4M MoE params


Bug: setdefault('moe_intermediate_size', ...) doesn't override the
1728 value from the full-scale config.json. Result: each expert is
SwiGLUMLP(256, 1728) = 1.3M params × 17 experts (16 routed + 1 shared) × 7 layers
= 158M params in MoE alone (70% of model), when it should be ~4M.
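A minimal sketch of the no-op, with the nested backbone/moe layout taken from the
diff below and the tiny-scale intermediate_size of 512 inferred from the commit
message (512 // 4 = 128):

    # config as loaded from the full-scale config.json, then run through the tiny preset
    config = {
        "intermediate_size": 512,
        "backbone": {"moe": {"moe_intermediate_size": 1728, "n_routed_experts": 16}},
    }
    moe_cfg = config.setdefault("backbone", {}).setdefault("moe", {})
    # setdefault() only writes when the key is missing, so the full-scale values survive
    moe_cfg.setdefault("moe_intermediate_size", config["intermediate_size"] // 4)
    moe_cfg.setdefault("n_routed_experts", 8)
    print(moe_cfg["moe_intermediate_size"], moe_cfg["n_routed_experts"])  # 1728 16, not 128 8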

Fix: when a scale preset is active, force-override moe_intermediate_size
to intermediate_size // 4 (= 128 for tiny). Also force n_routed_experts
to 8 (not 16 from full config).

Param reduction: 227M → ~73M total. Step time ~3× faster.
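The 158M figure is easy to reproduce if each expert is a standard SwiGLU MLP with
three hidden × intermediate weight matrices (gate/up/down); that factor of 3 is an
assumption, but it matches the SwiGLUMLP(256, 1728) ≈ 1.3M count quoted above:

    hidden, moe_inter = 256, 1728
    experts, moe_layers = 17, 7                      # 16 routed + 1 shared, 7 MoE layers
    per_expert = 3 * hidden * moe_inter              # gate + up + down ≈ 1.33M
    total_moe = per_expert * experts * moe_layers    # ≈ 157.9M, the "158M" above
    print(f"{per_expert / 1e6:.2f}M per expert, {total_moe / 1e6:.1f}M in MoE")

Shrinking those experts to the scale-appropriate sizes accounts for essentially all
of the 227M → ~73M drop.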

Files changed (1): chimera/training/common.py (+14 −2)

chimera/training/common.py CHANGED
@@ -64,12 +64,24 @@ def apply_standard_config_tweaks(config: dict[str, Any], *, scale: str, seq_len:
         "persistent_memory_slots": 16,
         "local_window_size": min(seq_len, 256),
     })
+
+    # ── MoE config ──
+    # IMPORTANT: force-override MoE sizes when a scale preset is active.
+    # The full-scale config.json has moe_intermediate_size=1728 (for hidden=2560).
+    # Using setdefault() would silently keep 1728 even at tiny scale (hidden=256),
+    # resulting in 158M params in MoE alone (70% of model).
     moe_cfg = config.setdefault("backbone", {}).setdefault("moe", {})
     moe_cfg.setdefault("layers", [3, 7, 11, 15, 19, 23, 27])
-    moe_cfg.setdefault("moe_intermediate_size", config["intermediate_size"] // 4)
-    moe_cfg.setdefault("n_routed_experts", 8)
+    if scale in DEFAULT_SCALE_PRESETS:
+        # Force scale-appropriate MoE sizes
+        moe_cfg["moe_intermediate_size"] = config["intermediate_size"] // 4
+        moe_cfg["n_routed_experts"] = 8
+    else:
+        moe_cfg.setdefault("moe_intermediate_size", config["intermediate_size"] // 4)
+        moe_cfg.setdefault("n_routed_experts", 8)
     moe_cfg.setdefault("n_shared_experts", 1)
     moe_cfg.setdefault("num_experts_per_tok", 2)
+
     config.setdefault("looping", {}).update({
         "enabled": True,
         "prelude": [0, 3],