File size: 3,109 Bytes
11c11f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from __future__ import annotations

import copy
import json
from pathlib import Path
from typing import Any, Mapping

from .paths import DEFAULT_CONFIG_PATH


def load_config(path: str | Path | None = None, overrides: Mapping[str, Any] | None = None) -> dict:
    """Load a Chimera JSON config and apply shallow dotted-key overrides."""
    if path is None:
        path = DEFAULT_CONFIG_PATH
    with open(path, "r", encoding="utf-8") as fh:
        cfg = json.load(fh)
    if overrides:
        cfg = copy.deepcopy(cfg)
        for key, value in overrides.items():
            cur = cfg
            parts = str(key).split(".")
            for part in parts[:-1]:
                cur = cur.setdefault(part, {})
            cur[parts[-1]] = value
    return cfg


def scale_config(config: dict, scale: str = "base") -> dict:
    """Return a safe CPU-scaled copy while preserving feature flags.

    The uploaded Chimera config targets a large model.  These presets keep all
    modules wired but resize dimensions so tests/fine-tuning fit commodity CPU
    memory (including 16 GB DDR5 machines).
    """
    cfg = copy.deepcopy(config)
    vocab = cfg.get("vocab_size", 32000)
    presets = {
        "nano": {
            "hidden_size": 128,
            "intermediate_size": 344,
            "num_hidden_layers": 4,
            "num_heads": 4,
            "head_dim": 32,
            "vocab_size": min(vocab, 8192),
        },
        "tiny": {
            "hidden_size": 256,
            "intermediate_size": 688,
            "num_hidden_layers": 6,
            "num_heads": 4,
            "head_dim": 64,
            "vocab_size": min(vocab, 32768),
        },
        "small": {
            "hidden_size": 512,
            "intermediate_size": 1376,
            "num_hidden_layers": 8,
            "num_heads": 8,
            "head_dim": 64,
            "vocab_size": min(vocab, 65536),
        },
        "base": {},
    }
    if scale not in presets:
        raise ValueError(f"unknown scale {scale!r}; choose {sorted(presets)}")
    cfg.update(presets[scale])

    # Clamp the head layout so num_heads divides hidden_size exactly.
    hidden = cfg["hidden_size"]
    max_heads = hidden // max(1, cfg.get("head_dim", 64))
    n_heads = max(1, min(cfg.get("num_heads", 4), max_heads))
    cfg["num_heads"] = n_heads
    cfg["head_dim"] = hidden // n_heads

    # Shrink the MoE section in place; drop layer indices past the new depth.
    moe = cfg.setdefault("backbone", {}).setdefault("moe", {})
    depth = cfg["num_hidden_layers"]
    moe["layers"] = [layer for layer in moe.get("layers", []) if layer < depth]
    expert_cap = 4 if scale in {"nano", "tiny"} else 8
    moe["n_routed_experts"] = min(int(moe.get("n_routed_experts", 4)), expert_cap)
    moe["n_shared_experts"] = min(int(moe.get("n_shared_experts", 1)), 1)
    moe["num_experts_per_tok"] = min(int(moe.get("num_experts_per_tok", 2)), moe["n_routed_experts"])
    ffn_cap = max(64, cfg["intermediate_size"] // 2)
    moe["moe_intermediate_size"] = min(int(moe.get("moe_intermediate_size", hidden * 2)), ffn_cap)

    # Layer looping needs enough depth for prelude/loop/coda segments;
    # below 8 layers it is switched off entirely.
    loop_cfg = cfg.setdefault("looping", {})
    if depth < 8:
        loop_cfg["enabled"] = False
    else:
        loop_cfg["prelude"] = [0, min(1, depth - 1)]
        loop_cfg["loop"] = [2, max(2, depth - 3)]
        loop_cfg["coda"] = [max(0, depth - 2), depth - 1]

    # Normalise the span-inference flag to a plain bool (defaults to on).
    span = cfg.setdefault("span_inference", {})
    span["enabled"] = bool(span.get("enabled", True))
    return cfg


def tiny_config() -> dict:
    """Convenience wrapper: the default config shrunk to the ``"nano"`` preset."""
    cfg = load_config()
    return scale_config(cfg, "nano")