from __future__ import annotations

import copy
import json
from pathlib import Path
from typing import Any, Mapping


def load_config(
    path: str | Path | None = None,
    overrides: Mapping[str, Any] | None = None,
) -> dict[str, Any]:
    """Load a Chimera JSON config and apply dotted-key overrides.

    Override keys may be dotted (e.g. ``"backbone.moe.n_routed_experts"``);
    intermediate dicts are created as needed and the final segment's value is
    replaced outright (no deep merging).
    """
    if path is None:
        # Default to the config.json shipped one directory above this module.
        path = Path(__file__).resolve().parents[1] / "config.json"
    with open(path, "r", encoding="utf-8") as fh:
        cfg = json.load(fh)
    if overrides:
        cfg = copy.deepcopy(cfg)
        for key, value in overrides.items():
            # Walk every dotted segment but the last, creating nested dicts.
            cur = cfg
            parts = str(key).split(".")
            for part in parts[:-1]:
                cur = cur.setdefault(part, {})
            cur[parts[-1]] = value
    return cfg
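
# Example usage for load_config (a sketch; the override value below is purely
# illustrative, but the dotted key matches the MoE settings used elsewhere in
# this module):
#
#     cfg = load_config(overrides={"backbone.moe.n_routed_experts": 2})
#     cfg["backbone"]["moe"]["n_routed_experts"]  # -> 2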


def scale_config(config: dict, scale: str = "base") -> dict:
    """Return a safe CPU-scaled copy while preserving feature flags.

    The uploaded Chimera config targets a large model. These presets keep all
    modules wired but resize dimensions so tests/fine-tuning fit commodity CPU
    memory (including 16 GB DDR5 machines).
    """
    cfg = copy.deepcopy(config)
    presets = {
        "nano": dict(hidden_size=128, intermediate_size=344, num_hidden_layers=4, num_heads=4,
                     head_dim=32, vocab_size=min(cfg.get("vocab_size", 32000), 8192)),
        "tiny": dict(hidden_size=256, intermediate_size=688, num_hidden_layers=6, num_heads=4,
                     head_dim=64, vocab_size=min(cfg.get("vocab_size", 32000), 32768)),
        "small": dict(hidden_size=512, intermediate_size=1376, num_hidden_layers=8, num_heads=8,
                      head_dim=64, vocab_size=min(cfg.get("vocab_size", 32000), 65536)),
        "base": {},
    }
    if scale not in presets:
        raise ValueError(f"unknown scale {scale!r}; choose {sorted(presets)}")
    cfg.update(presets[scale])

    # Clamp the head count to what the (possibly shrunken) hidden size allows,
    # then recompute head_dim from the clamped count.
    h = cfg["hidden_size"]
    cfg["num_heads"] = max(1, min(cfg.get("num_heads", 4), h // max(1, cfg.get("head_dim", 64))))
    cfg["head_dim"] = h // cfg["num_heads"]

    # Shrink the MoE block: drop layer indices beyond the new depth and cap the
    # expert counts plus the per-expert FFN width (roughly half the dense
    # intermediate size, floor 64).
    moe = cfg.setdefault("backbone", {}).setdefault("moe", {})
    moe["layers"] = [i for i in moe.get("layers", []) if i < cfg["num_hidden_layers"]]
    moe["n_routed_experts"] = min(int(moe.get("n_routed_experts", 4)), 4 if scale in {"nano", "tiny"} else 8)
    moe["n_shared_experts"] = min(int(moe.get("n_shared_experts", 1)), 1)
    moe["num_experts_per_tok"] = min(int(moe.get("num_experts_per_tok", 2)), moe["n_routed_experts"])
    moe["moe_intermediate_size"] = min(int(moe.get("moe_intermediate_size", h * 2)), max(64, cfg["intermediate_size"] // 2))

    # With fewer than 8 layers looping is pointless, so switch it off; otherwise
    # keep the existing flag and recompute the prelude/loop/coda index ranges
    # for the new depth.
    loop = cfg.setdefault("looping", {})
    if cfg["num_hidden_layers"] < 8:
        loop["enabled"] = False
    else:
        loop["prelude"] = [0, min(1, cfg["num_hidden_layers"] - 1)]
        loop["loop"] = [2, max(2, cfg["num_hidden_layers"] - 3)]
        loop["coda"] = [max(0, cfg["num_hidden_layers"] - 2), cfg["num_hidden_layers"] - 1]

    # Normalise the span-inference flag to a plain bool, defaulting to enabled.
    span = cfg.setdefault("span_inference", {})
    span["enabled"] = bool(span.get("enabled", True))
    return cfg
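
# Example usage for scale_config (a sketch; assumes the loaded config defines
# hidden_size and num_hidden_layers as this module expects):
#
#     small_cfg = scale_config(load_config(), "small")
#     small_cfg["hidden_size"]         # -> 512
#     small_cfg["num_hidden_layers"]   # -> 8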


def tiny_config() -> dict:
    """Repo config scaled with the smallest ("nano") preset, for CPU smoke tests."""
    return scale_config(load_config(), "nano")
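

if __name__ == "__main__":
    # Quick smoke check (illustrative sketch, not part of the original module):
    # the in-memory config below is made up so this runs without the repo's
    # config.json, and the "nano" preset shrinks it to test-friendly sizes.
    demo = scale_config(
        {"hidden_size": 4096, "intermediate_size": 11008,
         "num_hidden_layers": 32, "vocab_size": 32000},
        "nano",
    )
    print({k: demo[k] for k in ("hidden_size", "num_hidden_layers",
                                "num_heads", "head_dim", "vocab_size")})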