Upload inference.py

inference.py  CHANGED  (+45 −80)
@@ -1,26 +1,5 @@
 #!/usr/bin/env python3
-"""Chimera 5.2 — CPU-first inference / text generation.
-
-Significant CPU-friendly changes vs the previous draft:
-
-* **KV-cache aware loop** — after the first forward pass we only feed the
-  new token plus the per-layer recurrent state into the model. This makes
-  generation *O(T)* instead of *O(T²)*, the single biggest win for CPU
-  decoding.
-* **Pre-pack BitLinear weights** at startup so the first decoded token does
-  not pay the unpack/repack cost.
-* **Greedy fast path** (``temperature == 0``) skips softmax / sort entirely.
-* **Top-k constrained nucleus** — when both ``top_k`` and ``top_p`` are
-  used we sort the top-k slice only (not the full 200K vocabulary).
-* **Streaming output** — tokens are decoded incrementally so the first
-  bytes appear immediately.
-
-Usage::
-
-    python inference.py --checkpoint chimera_output/final/model.pt \\
-        --prompt "Once upon a time" --max_tokens 200
-"""
-
+"""Chimera 5.2 — CPU-first inference / text generation."""
 from __future__ import annotations
 
 import argparse
@@ -41,11 +20,9 @@ def _setup_cpu_runtime() -> None:
 
 _setup_cpu_runtime()
 
-
 import torch
 import torch.nn.functional as F
 
-
 try:
     torch.set_num_threads(int(os.environ.get("OMP_NUM_THREADS", os.cpu_count() or 4)))
     torch.set_num_interop_threads(int(os.environ.get("CHIMERA_INTEROP_THREADS", "1")))
@@ -57,9 +34,13 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 from chimera import Chimera51ForCausalLM, ChimeraTokenizer
 
 
-
-
-
+def _infer_dim(state, keys, idx):
+    for k in keys:
+        for sk, t in state.items():
+            if sk.endswith(k):
+                return int(t.shape[idx])
+    return None
+
 
 def load_model(checkpoint_path: str, device: str = "cpu"):
     print(f"[LOAD] Checkpoint: {checkpoint_path}")
@@ -77,38 +58,45 @@ def load_model(checkpoint_path: str, device: str = "cpu"):
     else:
         print("[LOAD] Config from checkpoint")
 
-    model = Chimera51ForCausalLM(config)
-    counts = model.count_parameters()
-    print(f"[LOAD] Params: {counts['total']:,} (ternary: {counts['ternary']:,})")
-
+    # ---- reconcile structural dims from checkpoint weights BEFORE model build ----
     state = ckpt.get("model", ckpt)
 
-
-
-
-    for key in ("embed.weight", "lm_head.weight"):
-        for sk, t in state.items():
-            if sk.endswith(key):
-                ckpt_vocab = int(t.shape[0])
-                break
-        if ckpt_vocab is not None:
-            break
-
-    if ckpt_vocab and ckpt_vocab != model_vocab:
-        print(f"[WARN] vocab mismatch ckpt={ckpt_vocab} cfg={model_vocab}; resizing")
-        with torch.no_grad():
-            old = model.embed.weight.data
-            new = torch.zeros(ckpt_vocab, old.shape[1], dtype=old.dtype, device=old.device)
-            new[:min(old.shape[0], ckpt_vocab)] = old[:min(old.shape[0], ckpt_vocab)]
-            model.embed = torch.nn.Embedding(ckpt_vocab, old.shape[1])
-            model.embed.weight.data = new
-            old_h = model.lm_head.weight.data
-            new_h = torch.zeros(ckpt_vocab, old_h.shape[1], dtype=old_h.dtype, device=old_h.device)
-            new_h[:min(old_h.shape[0], ckpt_vocab)] = old_h[:min(old_h.shape[0], ckpt_vocab)]
-            model.lm_head = torch.nn.Linear(old_h.shape[1], ckpt_vocab, bias=False)
-            model.lm_head.weight.data = new_h
+    ckpt_vocab = _infer_dim(state, ["embed.weight", "lm_head.weight"], 0)
+    if ckpt_vocab and ckpt_vocab != config.get("vocab_size", ckpt_vocab):
+        print(f"[WARN] vocab_size mismatch ckpt={ckpt_vocab} cfg={config.get('vocab_size')}; resizing")
         config["vocab_size"] = ckpt_vocab
 
+    ckpt_hidden = _infer_dim(state, ["embed.weight", "lm_head.weight"], 1)
+    if ckpt_hidden and ckpt_hidden != config.get("hidden_size", ckpt_hidden):
+        print(f"[WARN] hidden_size mismatch ckpt={ckpt_hidden} cfg={config.get('hidden_size')}; resizing")
+        config["hidden_size"] = ckpt_hidden
+
+    # head_dim from any attention q_proj (shape [num_heads*head_dim, hidden_size])
+    ckpt_q = _infer_dim(state, ["layers.0.attn.q_proj.weight", "layers.1.attn.q_proj.weight"], 0)
+    if ckpt_q and ckpt_hidden:
+        head_dim_guess = config.get("head_dim")
+        num_heads_guess = config.get("num_heads", 40)
+        if head_dim_guess and ckpt_q != num_heads_guess * head_dim_guess:
+            # mismatch — try to infer actual head_dim from q_proj / num_heads
+            for nh in [1, 2, 4, 5, 8, 10, 16, 20, 32, 40, 64]:
+                if ckpt_q % nh == 0:
+                    inferred_hd = ckpt_q // nh
+                    if ckpt_hidden % inferred_hd == 0:
+                        config["num_heads"] = nh
+                        config["head_dim"] = inferred_hd
+                        print(f"[WARN] auto-inferred num_heads={nh}, head_dim={inferred_hd} from q_proj={ckpt_q}")
+                        break
+
+    ckpt_inter = _infer_dim(state, ["layers.0.ffn.gate_proj.weight", "layers.1.ffn.gate_proj.weight"], 0)
+    if ckpt_inter and ckpt_inter != config.get("intermediate_size", ckpt_inter):
+        print(f"[WARN] intermediate_size mismatch ckpt={ckpt_inter} cfg={config.get('intermediate_size')}; resizing")
+        config["intermediate_size"] = ckpt_inter
+    # ---------------------------------------------------------------------------
+
+    model = Chimera51ForCausalLM(config)
+    counts = model.count_parameters()
+    print(f"[LOAD] Params: {counts['total']:,} (ternary: {counts['ternary']:,})")
+
     missing, unexpected = model.load_state_dict(state, strict=False)
     if missing:
         print(f"[WARN] Missing keys ({len(missing)}): {missing[:5]}...")
@@ -116,7 +104,7 @@ def load_model(checkpoint_path: str, device: str = "cpu"):
         print(f"[WARN] Unexpected keys ({len(unexpected)}): {unexpected[:5]}...")
 
     model.to(device).eval()
-    model.prepare_for_inference()
+    model.prepare_for_inference()
 
     step = ckpt.get("step", "?")
     best_loss = ckpt.get("best_loss")
@@ -127,22 +115,13 @@ def load_model(checkpoint_path: str, device: str = "cpu"):
     return model, config
 
 
-# ---------------------------------------------------------------------------
-# Sampling helpers
-# ---------------------------------------------------------------------------
-
 def _sample_next(logits: torch.Tensor, temperature: float, top_p: float, top_k: int
                  ) -> int:
-    """Return the next token id sampled from ``logits`` ([1, V] or [V])."""
     if logits.dim() == 1:
         logits = logits.unsqueeze(0)
-
-    # Greedy fast path.
     if temperature <= 0.0:
         return int(torch.argmax(logits, dim=-1).item())
-
     logits = logits / temperature
-
     if top_k and top_k > 0:
         k = min(top_k, logits.size(-1))
         cand_logits, cand_indices = torch.topk(logits, k, dim=-1)
@@ -157,7 +136,6 @@ def _sample_next(logits: torch.Tensor, temperature: float, top_p: float, top_k:
             return int(sorted_indices.gather(-1, torch.multinomial(probs, 1)).item())
         probs = F.softmax(cand_logits, dim=-1)
         return int(cand_indices.gather(-1, torch.multinomial(probs, 1)).item())
-
     if top_p < 1.0:
         sorted_logits, sorted_indices = torch.sort(logits, descending=True)
         cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
@@ -166,15 +144,10 @@ def _sample_next(logits: torch.Tensor, temperature: float, top_p: float, top_k:
         sorted_logits = sorted_logits.masked_fill(remove, float("-inf"))
         probs = F.softmax(sorted_logits, dim=-1)
         return int(sorted_indices.gather(-1, torch.multinomial(probs, 1)).item())
-
     probs = F.softmax(logits, dim=-1)
     return int(torch.multinomial(probs, 1).item())
 
 
-# ---------------------------------------------------------------------------
-# Generation loop
-# ---------------------------------------------------------------------------
-
 def generate(model: Chimera51ForCausalLM, tokenizer: ChimeraTokenizer,
              prompt: str, max_tokens: int = 100, temperature: float = 0.8,
             top_p: float = 0.9, top_k: int = 50, device: str = "cpu",
@@ -201,7 +174,6 @@ def generate(model: Chimera51ForCausalLM, tokenizer: ChimeraTokenizer,
 
     t0 = time.time()
     with torch.inference_mode(), autocast_ctx:
-        # Initial pass: feed the whole prompt and capture per-layer caches.
         out = model(input_ids, use_cache=True, logits_to_keep=1)
         caches = out.caches
         next_token = _sample_next(out.logits[:, -1, :].float(), temperature, top_p, top_k)
@@ -218,7 +190,6 @@ def generate(model: Chimera51ForCausalLM, tokenizer: ChimeraTokenizer,
                 break
             generated.append(next_token)
             if stream:
-                # Try to render only the newly produced text.
                 full = tokenizer.decode(generated, skip_special_tokens=False)
                 if full.startswith(decoded_so_far):
                     sys.stdout.write(full[len(decoded_so_far):])
@@ -241,15 +212,10 @@ def generate(model: Chimera51ForCausalLM, tokenizer: ChimeraTokenizer,
 class _nullctx:
     def __enter__(self):
         return self
-
     def __exit__(self, *args):
         return False
 
 
-# ---------------------------------------------------------------------------
-# CLI
-# ---------------------------------------------------------------------------
-
 def main() -> None:
     p = argparse.ArgumentParser(description="Chimera 5.2 CPU inference")
     p.add_argument("--checkpoint", default="chimera_output/final/model.pt")
@@ -286,8 +252,7 @@ def main() -> None:
 
     print("[WARM] Warmup forward...")
     with torch.inference_mode():
-        _ = model(torch.tensor([[tokenizer.eos_token_id]], device=args.device),
-                  logits_to_keep=1)
+        _ = model(torch.tensor([[tokenizer.eos_token_id]], device=args.device), logits_to_keep=1)
     print("[WARM] Done.")
 
     generate(
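
The main change in this upload is the dimension-reconciliation block in load_model: instead of building the model first and then resizing embed / lm_head to match the checkpoint, the script now reads vocab_size, hidden_size, num_heads / head_dim, and intermediate_size directly from the checkpoint tensors (via _infer_dim) and patches the config before Chimera51ForCausalLM is constructed. Below is a minimal standalone sketch of that idea; the tiny state dict and config values are made up for illustration, and only the tensor-name suffixes and config keys mirror the diff.

import torch

def _infer_dim(state, keys, idx):
    # Size of dimension `idx` of the first tensor whose name ends with one of `keys`.
    for k in keys:
        for sk, t in state.items():
            if sk.endswith(k):
                return int(t.shape[idx])
    return None

# Hypothetical checkpoint tensors: vocab=1000, hidden=64, intermediate=256.
state = {
    "model.embed.weight": torch.zeros(1000, 64),
    "model.layers.0.ffn.gate_proj.weight": torch.zeros(256, 64),
}
# Config the script would otherwise build the model from.
config = {"vocab_size": 999, "hidden_size": 64, "intermediate_size": 128}

for key, names, dim in [
    ("vocab_size", ["embed.weight", "lm_head.weight"], 0),
    ("hidden_size", ["embed.weight", "lm_head.weight"], 1),
    ("intermediate_size", ["layers.0.ffn.gate_proj.weight"], 0),
]:
    found = _infer_dim(state, names, dim)
    if found and found != config.get(key, found):
        print(f"[WARN] {key} mismatch ckpt={found} cfg={config.get(key)}; resizing")
        config[key] = found

print(config)  # {'vocab_size': 1000, 'hidden_size': 64, 'intermediate_size': 256}

Building the model only after the config matches the checkpoint avoids the old path of constructing mismatched embed / lm_head modules and copying rows across under torch.no_grad().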
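
The shortened module docstring drops the feature list, but the sampling behaviour it described is still what _sample_next implements: a greedy fast path when temperature is 0, and, when top_k and top_p are both set, nucleus filtering applied only to the top-k slice instead of the full ~200K-entry vocabulary. The following is a condensed, self-contained version of that logic, simplified from the helper in the diff (the real function re-sorts and gathers through sorted_indices; this sketch relies on torch.topk returning values in descending order):

import torch
import torch.nn.functional as F

def sample_next(logits, temperature=0.8, top_p=0.9, top_k=50):
    """Pick one token id from logits of shape [V] or [1, V]."""
    if logits.dim() == 1:
        logits = logits.unsqueeze(0)
    if temperature <= 0.0:
        # Greedy fast path: no softmax or sorting at all.
        return int(torch.argmax(logits, dim=-1).item())
    logits = logits / temperature
    if top_k and top_k > 0:
        k = min(top_k, logits.size(-1))
        # Only the k best logits are ever normalised / filtered.
        cand_logits, cand_indices = torch.topk(logits, k, dim=-1)
        if top_p < 1.0:
            probs = F.softmax(cand_logits, dim=-1)           # topk output is already descending
            cum = torch.cumsum(probs, dim=-1)
            drop = (cum - probs) > top_p                      # keep the smallest set covering top_p mass
            cand_logits = cand_logits.masked_fill(drop, float("-inf"))
        probs = F.softmax(cand_logits, dim=-1)
        return int(cand_indices.gather(-1, torch.multinomial(probs, 1)).item())
    if top_p < 1.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        probs = F.softmax(sorted_logits, dim=-1)
        cum = torch.cumsum(probs, dim=-1)
        sorted_logits = sorted_logits.masked_fill((cum - probs) > top_p, float("-inf"))
        probs = F.softmax(sorted_logits, dim=-1)
        return int(sorted_indices.gather(-1, torch.multinomial(probs, 1)).item())
    probs = F.softmax(logits, dim=-1)
    return int(torch.multinomial(probs, 1).item())

print(sample_next(torch.randn(1, 200_000)))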