Lgr54HFi
/

ch1mera

chimera51

custom_code

Model card Files Files and versions

xet

Community

Lgr54HFi committed 28 days ago

Commit

bc0ec84

verified ·

1 Parent(s): 0a7fd59

Upload inference.py

Browse files

Files changed (1) hide show

inference.py +87 -43

inference.py CHANGED Viewed

@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
-"""Chimera 5.2 — CPU-first inference / text generation."""
 from __future__ import annotations
 import argparse
@@ -7,6 +10,7 @@ import json
 import os
 import sys
 import time
 def _setup_cpu_runtime() -> None:
@@ -34,13 +38,36 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 from chimera import Chimera51ForCausalLM, ChimeraTokenizer
def _infer_dim(state, keys, idx):
    """Return one dimension of the first checkpoint entry matching a key suffix.

    The suffixes in ``keys`` are tried in order. For each suffix, ``state`` is
    scanned in its own iteration order and the first entry whose name ends
    with that suffix *and* whose shape actually has axis ``idx`` wins.

    Args:
        state: Mapping from parameter name to an object exposing ``.shape``.
        keys: Name suffixes to look for, in priority order.
        idx: Axis of ``shape`` to read (negative indices are accepted).

    Returns:
        The size of axis ``idx`` as an ``int``, or ``None`` when nothing matches.
    """
    for suffix in keys:
        for name, tensor in state.items():
            if not name.endswith(suffix):
                continue
            shape = tensor.shape
            # A suffix match can land on a tensor of lower rank than expected;
            # keep searching instead of raising IndexError on shape[idx].
            if -len(shape) <= idx < len(shape):
                return int(shape[idx])
    return None
 def load_model(checkpoint_path: str, device: str = "cpu"):
     print(f"[LOAD] Checkpoint: {checkpoint_path}")
@@ -58,46 +85,51 @@ def load_model(checkpoint_path: str, device: str = "cpu"):
     else:
         print("[LOAD] Config from checkpoint")
-    # ---- reconcile structural dims from checkpoint weights BEFORE model build ----
-    state = ckpt.get("model", ckpt)
-    ckpt_vocab = _infer_dim(state, ["embed.weight", "lm_head.weight"], 0)
-    if ckpt_vocab and ckpt_vocab != config.get("vocab_size", ckpt_vocab):
-        print(f"[WARN] vocab_size mismatch ckpt={ckpt_vocab} cfg={config.get('vocab_size')}; resizing")
-        config["vocab_size"] = ckpt_vocab
-    ckpt_hidden = _infer_dim(state, ["embed.weight", "lm_head.weight"], 1)
-    if ckpt_hidden and ckpt_hidden != config.get("hidden_size", ckpt_hidden):
-        print(f"[WARN] hidden_size mismatch ckpt={ckpt_hidden} cfg={config.get('hidden_size')}; resizing")
-        config["hidden_size"] = ckpt_hidden
-    # head_dim from any attention q_proj (shape [num_heads*head_dim, hidden_size])
-    ckpt_q = _infer_dim(state, ["layers.0.attn.q_proj.weight", "layers.1.attn.q_proj.weight"], 0)
-    if ckpt_q and ckpt_hidden:
-        head_dim_guess = config.get("head_dim")
-        num_heads_guess = config.get("num_heads", 40)
-        if head_dim_guess and ckpt_q != num_heads_guess * head_dim_guess:
-            # mismatch — try to infer actual head_dim from q_proj / num_heads
-            for nh in [1, 2, 4, 5, 8, 10, 16, 20, 32, 40, 64]:
-                if ckpt_q % nh == 0:
-                    inferred_hd = ckpt_q // nh
-                    if ckpt_hidden % inferred_hd == 0:
-                        config["num_heads"] = nh
-                        config["head_dim"] = inferred_hd
-                        print(f"[WARN] auto-inferred num_heads={nh}, head_dim={inferred_hd} from q_proj={ckpt_q}")
-                        break
-    ckpt_inter = _infer_dim(state, ["layers.0.ffn.gate_proj.weight", "layers.1.ffn.gate_proj.weight"], 0)
-    if ckpt_inter and ckpt_inter != config.get("intermediate_size", ckpt_inter):
-        print(f"[WARN] intermediate_size mismatch ckpt={ckpt_inter} cfg={config.get('intermediate_size')}; resizing")
-        config["intermediate_size"] = ckpt_inter
-    # ---------------------------------------------------------------------------
     model = Chimera51ForCausalLM(config)
     counts = model.count_parameters()
     print(f"[LOAD] Params: {counts['total']:,}  (ternary: {counts['ternary']:,})")
-    missing, unexpected = model.load_state_dict(state, strict=False)
     if missing:
         print(f"[WARN] Missing keys ({len(missing)}): {missing[:5]}...")
     if unexpected:
@@ -115,6 +147,10 @@ def load_model(checkpoint_path: str, device: str = "cpu"):
     return model, config
 def _sample_next(logits: torch.Tensor, temperature: float, top_p: float, top_k: int
                  ) -> int:
     if logits.dim() == 1:
@@ -148,6 +184,10 @@ def _sample_next(logits: torch.Tensor, temperature: float, top_p: float, top_k:
     return int(torch.multinomial(probs, 1).item())
 def generate(model: Chimera51ForCausalLM, tokenizer: ChimeraTokenizer,
              prompt: str, max_tokens: int = 100, temperature: float = 0.8,
              top_p: float = 0.9, top_k: int = 50, device: str = "cpu",
@@ -216,6 +256,10 @@ class _nullctx:
         return False
 def main() -> None:
     p = argparse.ArgumentParser(description="Chimera 5.2 CPU inference")
     p.add_argument("--checkpoint", default="chimera_output/final/model.pt")

 #!/usr/bin/env python3
+"""Chimera 5.2 — CPU-first inference / text generation.
+Config is source of truth. Checkpoint weights are resized to match the model.
+"""
 from __future__ import annotations
 import argparse
 import os
 import sys
 import time
+from typing import Dict, Tuple
 def _setup_cpu_runtime() -> None:
 from chimera import Chimera51ForCausalLM, ChimeraTokenizer
+# ---------------------------------------------------------------------------
+# Resize helpers: checkpoint weights -> model architecture (config is truth)
+# ---------------------------------------------------------------------------
@torch.no_grad()
def _resize_1d(w: torch.Tensor, target: int, fill_value: float = 1.0) -> torch.Tensor:
    """Return a new 1-D tensor of length ``target`` seeded from ``w``.

    The leading ``min(w.numel(), target)`` elements are copied from ``w``;
    any remaining tail positions hold ``fill_value``. A fresh tensor is
    always returned, even when the length already matches.

    Args:
        w: Source tensor. It is flattened first, so the element count and the
            copied slice always refer to the same elements.
        target: Length of the returned tensor.
        fill_value: Value used for positions past the end of ``w``. Defaults
            to ``1.0``, the padding this helper has always used. Presumably
            that suits scale vectors; pass ``0.0`` for additive terms such as
            biases. TODO confirm against the parameter being resized.

    Returns:
        A tensor of shape ``(target,)`` with the dtype and device of ``w``.
    """
    flat = w.reshape(-1)
    out = torch.full((target,), fill_value, dtype=w.dtype, device=w.device)
    n = min(flat.numel(), target)
    out[:n] = flat[:n]
    return out
@torch.no_grad()
def _resize_2d(w: torch.Tensor, target_shape: Tuple[int, int]) -> torch.Tensor:
    """Return ``w`` cropped and/or padded to ``target_shape``.

    The overlapping top-left region is copied from ``w`` unchanged. Cells
    outside that region are filled as follows:

    * floating-point ``w``: zero-mean Gaussian noise whose standard deviation
      is the population std of ``w`` clamped to ``[1e-4, 0.2]`` (``0.02`` is
      used when ``w`` has at most one element or its std is NaN);
    * any other dtype: zeros, because ``std`` and ``normal_`` are only
      defined for floating-point tensors.

    Args:
        w: Source matrix; must be 2-D (unpacking ``w.shape`` raises otherwise).
        target_shape: ``(rows, cols)`` of the result.

    Returns:
        ``w`` itself when the shape already matches, otherwise a new tensor of
        shape ``target_shape`` with the dtype and device of ``w``.
    """
    rows_out, cols_out = target_shape
    rows_in, cols_in = w.shape
    if (rows_in, cols_in) == (rows_out, cols_out):
        return w

    if w.is_floating_point():
        std = 0.02
        if w.numel() > 1:
            measured = w.std(unbiased=False)
            # NaN would slip through the min/max clamp below and make
            # normal_ raise, so keep the default scale in that case.
            if not bool(torch.isnan(measured)):
                std = float(measured.item())
        std = max(min(std, 0.2), 1e-4)
        out = torch.empty((rows_out, cols_out), dtype=w.dtype, device=w.device)
        out.normal_(mean=0.0, std=std)
    else:
        # Non-floating weights cannot be sampled from a normal distribution.
        out = torch.zeros((rows_out, cols_out), dtype=w.dtype, device=w.device)

    keep_rows = min(rows_in, rows_out)
    keep_cols = min(cols_in, cols_out)
    out[:keep_rows, :keep_cols] = w[:keep_rows, :keep_cols]
    return out
+# ---------------------------------------------------------------------------
+# Checkpoint loading
+# ---------------------------------------------------------------------------
 def load_model(checkpoint_path: str, device: str = "cpu"):
     print(f"[LOAD] Checkpoint: {checkpoint_path}")
     else:
         print("[LOAD] Config from checkpoint")
     model = Chimera51ForCausalLM(config)
     counts = model.count_parameters()
     print(f"[LOAD] Params: {counts['total']:,}  (ternary: {counts['ternary']:,})")
+    state = ckpt.get("model", ckpt)
+    model_state = model.state_dict()
+    # Config is source of truth: resize checkpoint tensors to match model.
+    resized: Dict[str, torch.Tensor] = {}
+    for k, v in state.items():
+        if k in model_state:
+            expected = model_state[k].shape
+            if v.shape != expected:
+                print(f"[WARN] resizing {k}: {tuple(v.shape)} -> {tuple(expected)}")
+                if v.ndim == 1:
+                    v = _resize_1d(v, expected[0])
+                elif v.ndim == 2:
+                    v = _resize_2d(v, expected)
+                else:
+                    print(f"[SKIP] {k}: cannot resize {v.ndim}D tensor")
+                    continue
+            resized[k] = v
+        else:
+            resized[k] = v
+    # Vocab reconciliation: if vocab mismatch, re-init embed + lm_head.
+    model_vocab = int(config.get("vocab_size", model.embed.num_embeddings))
+    if "embed.weight" in resized:
+        ckpt_vocab = int(resized["embed.weight"].shape[0])
+        if ckpt_vocab != model_vocab:
+            print(f"[WARN] vocab mismatch ckpt={ckpt_vocab} cfg={model_vocab}; re-init embed+head")
+            with torch.no_grad():
+                old = model.embed.weight.data
+                new = torch.zeros(ckpt_vocab, old.shape[1], dtype=old.dtype, device=old.device)
+                new[:min(old.shape[0], ckpt_vocab)] = old[:min(old.shape[0], ckpt_vocab)]
+                model.embed = torch.nn.Embedding(ckpt_vocab, old.shape[1])
+                model.embed.weight.data = new
+                old_h = model.lm_head.weight.data
+                new_h = torch.zeros(ckpt_vocab, old_h.shape[1], dtype=old_h.dtype, device=old_h.device)
+                new_h[:min(old_h.shape[0], ckpt_vocab)] = old_h[:min(old_h.shape[0], ckpt_vocab)]
+                model.lm_head = torch.nn.Linear(old_h.shape[1], ckpt_vocab, bias=False)
+                model.lm_head.weight.data = new_h
+            config["vocab_size"] = ckpt_vocab
+    missing, unexpected = model.load_state_dict(resized, strict=False)
     if missing:
         print(f"[WARN] Missing keys ({len(missing)}): {missing[:5]}...")
     if unexpected:
     return model, config
+# ---------------------------------------------------------------------------
+# Sampling helpers
+# ---------------------------------------------------------------------------
 def _sample_next(logits: torch.Tensor, temperature: float, top_p: float, top_k: int
                  ) -> int:
     if logits.dim() == 1:
     return int(torch.multinomial(probs, 1).item())
+# ---------------------------------------------------------------------------
+# Generation loop
+# ---------------------------------------------------------------------------
 def generate(model: Chimera51ForCausalLM, tokenizer: ChimeraTokenizer,
              prompt: str, max_tokens: int = 100, temperature: float = 0.8,
              top_p: float = 0.9, top_k: int = 50, device: str = "cpu",
         return False
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
 def main() -> None:
     p = argparse.ArgumentParser(description="Chimera 5.2 CPU inference")
     p.add_argument("--checkpoint", default="chimera_output/final/model.pt")