GitHub Actions committed on
Commit
cc6274a
·
0 Parent(s):

Auto-deploy from GitHub Actions

Browse files
Files changed (50) hide show
  1. README.md +38 -0
  2. app.py +2376 -0
  3. requirements.txt +2 -0
  4. src/llm_cal/__init__.py +6 -0
  5. src/llm_cal/architecture/__init__.py +0 -0
  6. src/llm_cal/architecture/detector.py +134 -0
  7. src/llm_cal/architecture/formulas/__init__.py +0 -0
  8. src/llm_cal/architecture/formulas/kv_cache.py +145 -0
  9. src/llm_cal/architecture/formulas/weight.py +133 -0
  10. src/llm_cal/architecture/profile.py +97 -0
  11. src/llm_cal/architecture/traits.py +150 -0
  12. src/llm_cal/benchmark/__init__.py +0 -0
  13. src/llm_cal/benchmark/dataset.yaml +203 -0
  14. src/llm_cal/benchmark/runner.py +232 -0
  15. src/llm_cal/cli.py +207 -0
  16. src/llm_cal/command_generator/__init__.py +0 -0
  17. src/llm_cal/command_generator/sglang.py +50 -0
  18. src/llm_cal/command_generator/vllm.py +55 -0
  19. src/llm_cal/common/__init__.py +0 -0
  20. src/llm_cal/common/i18n.py +421 -0
  21. src/llm_cal/common/yaml_loader.py +48 -0
  22. src/llm_cal/core/__init__.py +0 -0
  23. src/llm_cal/core/cache.py +97 -0
  24. src/llm_cal/core/evaluator.py +375 -0
  25. src/llm_cal/core/explain.py +504 -0
  26. src/llm_cal/engine_compat/__init__.py +0 -0
  27. src/llm_cal/engine_compat/loader.py +118 -0
  28. src/llm_cal/engine_compat/matrix.yaml +512 -0
  29. src/llm_cal/fleet/__init__.py +0 -0
  30. src/llm_cal/fleet/planner.py +282 -0
  31. src/llm_cal/hardware/__init__.py +0 -0
  32. src/llm_cal/hardware/gpu_database.yaml +613 -0
  33. src/llm_cal/hardware/loader.py +77 -0
  34. src/llm_cal/llm_review/__init__.py +0 -0
  35. src/llm_cal/llm_review/reviewer.py +218 -0
  36. src/llm_cal/model_source/__init__.py +0 -0
  37. src/llm_cal/model_source/auth.py +33 -0
  38. src/llm_cal/model_source/base.py +58 -0
  39. src/llm_cal/model_source/huggingface.py +118 -0
  40. src/llm_cal/model_source/modelscope.py +229 -0
  41. src/llm_cal/output/__init__.py +0 -0
  42. src/llm_cal/output/formatter.py +665 -0
  43. src/llm_cal/output/labels.py +46 -0
  44. src/llm_cal/performance/__init__.py +0 -0
  45. src/llm_cal/performance/compute.py +233 -0
  46. src/llm_cal/performance/concurrency.py +132 -0
  47. src/llm_cal/weight_analyzer/__init__.py +146 -0
  48. src/llm_cal/weight_analyzer/fingerprint.py +292 -0
  49. src/llm_cal/weight_analyzer/reconciler.py +247 -0
  50. src/llm_cal/weight_analyzer/safetensors_reader.py +163 -0
README.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: llm-cal
3
+ emoji: 🧮
4
+ colorFrom: indigo
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 6.13.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ short_description: LLM inference sizing — honest, architecture-aware
12
+ ---
13
+
14
+ # llm-cal — LLM inference hardware calculator
15
+
16
+ Web UI for [`llm-cal`](https://github.com/FlyTOmeLight/llm-cal). Pick a model, pick a GPU, get a hardware plan.
17
+
18
+ Architecture-aware (MLA, NSA, CSA+HCA, MoE, sliding window). Engine-aware (vLLM, SGLang). Honest-labeled — every number carries a provenance tag (`[verified]` / `[inferred]` / `[estimated]` / `[cited]` / `[unverified]` / `[unknown]`).
19
+
20
+ ## The story this Space exists to tell
21
+
22
+ `gpu_poor` reports DeepSeek-V4-Flash as 284 GB by assuming pure FP8. The real safetensors weight is 160 GB — it ships an FP4+FP8 mixed pack. `llm-cal` reads the actual on-disk dtype (per-tensor metadata + MX block-scaled scale tensors) and gets 160.01 GB at **0.2% error**.
23
+
24
+ That's the whole pitch.
25
+
26
+ ## Local
27
+
28
+ ```bash
29
+ pip install llm-cal gradio
30
+ python app.py
31
+ ```
32
+
33
+ ## Links
34
+
35
+ - [GitHub repo](https://github.com/FlyTOmeLight/llm-cal)
36
+ - [Full docs](https://flytomelight.github.io/llm-cal/)
37
+ - [Methodology](https://flytomelight.github.io/llm-cal/methodology/) — every formula's primary source
38
+ - [Pre-rendered model pages](https://flytomelight.github.io/llm-cal/models/) — popular model × GPU combos
app.py ADDED
@@ -0,0 +1,2376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """llm-cal Gradio web app — deploys to HuggingFace Spaces.
2
+
3
+ User journey:
4
+ 1. Type a HuggingFace model id (or pick from examples)
5
+ 2. Choose target GPU
6
+ 3. Hit Calculate
7
+ 4. Read the same `--explain`-quality output the CLI gives you, but in a browser
8
+ and shareable via URL parameters.
9
+
10
+ The whole compute is the existing Python `Evaluator`. No new logic.
11
+
12
+ Local run:
13
+ python web/app.py
14
+ HF Spaces:
15
+ This file is the entry point Spaces expects. requirements.txt sits next to it.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import sys
21
+ from pathlib import Path
22
+
23
+ # Ensure src/ is importable. Two layouts supported:
24
+ # 1. Local dev: /repo/web/app.py + /repo/src/ (parent.parent / src)
25
+ # 2. HF Space: /space/app.py + /space/src/ (parent / src)
26
+ # The deploy workflow flattens layout 1 → layout 2 when pushing to the Space.
27
# Directory containing this file; both candidate `src` locations are resolved
# relative to it.
_HERE = Path(__file__).resolve().parent
# Probe `<here>/src` first, then `<here>/../src`.
for _candidate in (_HERE / "src", _HERE.parent / "src"):
    if _candidate.exists():
        # Insert at index 0 so this directory is searched before the entries
        # already on sys.path.
        sys.path.insert(0, str(_candidate))
        break  # only the first existing candidate is added
32
+
33
+ import os # noqa: E402
34
+
35
+ import gradio as gr # noqa: E402
36
+
37
+ from llm_cal.common.i18n import set_locale, t # noqa: E402
38
+ from llm_cal.core.evaluator import EvaluationReport, Evaluator # noqa: E402
39
+ from llm_cal.core.explain import ExplainEntry # noqa: E402
40
+ from llm_cal.core.explain import build as build_explain # noqa: E402
41
+ from llm_cal.hardware.loader import load_database # noqa: E402
42
+ from llm_cal.llm_review.reviewer import run_review # noqa: E402
43
+ from llm_cal.model_source.huggingface import HuggingFaceSource # noqa: E402
44
+ from llm_cal.model_source.modelscope import ModelScopeSource # noqa: E402
45
+
46
+ # ---------------------------------------------------------------------------
47
+ # Static data the UI needs
48
+
49
# Result of `load_database()`, evaluated once at import time. Its `.gpus`
# entries (each exposing `.id`) are what `_build_vendor_index` iterates.
_DB = load_database()
50
+
51
+
52
+ def _classify_vendor(gpu_id: str) -> tuple[str, str]:
53
+ """Map a GPU id to (vendor_en, vendor_zh).
54
+
55
+ Vendor isn't in the YAML schema (yet), so derive from the id prefix.
56
+ """
57
+ gid = gpu_id.upper()
58
+ if gid in {"B200", "GB200", "H100", "H800", "H200", "H20", "GH200"} or gid.startswith(
59
+ ("L4", "L40", "RTX", "A10", "A100", "A40", "V100", "T4")
60
+ ):
61
+ return ("NVIDIA", "NVIDIA")
62
+ if gid.startswith("MI"):
63
+ return ("AMD", "AMD")
64
+ if gid.startswith("GAUDI"):
65
+ return ("Intel Habana", "英特尔 Habana")
66
+ if gid.startswith("910") or gid.startswith("ATLAS"):
67
+ return ("Huawei Ascend", "华为昇腾")
68
+ if gid.startswith("MXC"):
69
+ return ("MetaX 沐曦", "沐曦 MetaX")
70
+ if gid.startswith("KUNLUN"):
71
+ return ("Kunlunxin 昆仑芯", "昆仑芯 Kunlunxin")
72
+ if gid.startswith("BR"):
73
+ return ("Biren 壁仞", "壁仞 Biren")
74
+ if gid.startswith("BI-"):
75
+ return ("Iluvatar 天数智芯", "天数智芯 Iluvatar")
76
+ if gid.startswith(("MR-", "MTT")):
77
+ return ("Moore Threads 摩尔线程", "摩尔线程 Moore Threads")
78
+ if gid.startswith("MLU"):
79
+ return ("Cambricon 寒武纪", "寒武纪 Cambricon")
80
+ if gid.startswith("HYGON"):
81
+ return ("Hygon 海光", "海光 Hygon")
82
+ return ("Other", "其他")
83
+
84
+
85
+ # Stable vendor display order
86
# English vendor names in the order they should be displayed. The entries
# mirror the first tuple element returned by `_classify_vendor`;
# `_build_vendor_index` seeds its buckets from this list, so the resulting
# dict's key order follows it.
_VENDOR_ORDER = [
    "NVIDIA",
    "AMD",
    "Intel Habana",
    "Huawei Ascend",
    "MetaX 沐曦",
    "Kunlunxin 昆仑芯",
    "Biren 壁仞",
    "Iluvatar 天数智芯",
    "Moore Threads 摩尔线程",
    "Cambricon 寒武纪",
    "Hygon 海光",
    "Other",
]
100
+
101
+
102
def _build_vendor_index() -> dict[str, list[str]]:
    """Group the GPU ids in ``_DB`` by English vendor name.

    Buckets are created in ``_VENDOR_ORDER`` order (a vendor not listed there
    is appended on first sight), each bucket is sorted, and vendors with no
    GPUs are left out of the returned mapping.
    """
    grouped: dict[str, list[str]] = {}
    for vendor in _VENDOR_ORDER:
        grouped[vendor] = []

    for gpu in _DB.gpus:
        vendor_en = _classify_vendor(gpu.id)[0]
        if vendor_en not in grouped:
            grouped[vendor_en] = []
        grouped[vendor_en].append(gpu.id)

    index: dict[str, list[str]] = {}
    for vendor, gpu_ids in grouped.items():
        if gpu_ids:  # skip vendors that ended up with no GPUs
            index[vendor] = sorted(gpu_ids)
    return index
112
+
113
+
114
# vendor_en -> sorted GPU ids, computed once at import time.
_VENDOR_TO_GPUS = _build_vendor_index()
# Keys of the index above, in the dict's insertion order.
VENDOR_CHOICES_EN: list[str] = list(_VENDOR_TO_GPUS.keys())
# NOTE(review): presumably the initial vendor / GPU selections in the UI —
# their use is not visible in this part of the file; confirm against callers.
DEFAULT_VENDOR = "NVIDIA"
DEFAULT_GPU = "H800"

# Example rows; each tuple is positional, in the field order given by the
# comment on the first line of the literal.
EXAMPLE_MODELS: list[tuple[str, str, str, str, str]] = [
    # (model_id, vendor, gpu, engine, source)
    ("deepseek-ai/DeepSeek-V4-Flash", "NVIDIA", "H800", "vllm", "HuggingFace"),
    ("deepseek-ai/DeepSeek-V3", "NVIDIA", "H800", "vllm", "HuggingFace"),
    ("Qwen/Qwen2.5-72B-Instruct", "NVIDIA", "H100", "vllm", "HuggingFace"),
    ("Qwen/Qwen3-30B-A3B", "NVIDIA", "A100-80G", "vllm", "HuggingFace"),
    ("mistralai/Mixtral-8x7B-v0.1", "NVIDIA", "H100", "vllm", "HuggingFace"),
    ("microsoft/Phi-4", "NVIDIA", "RTX4090", "vllm", "HuggingFace"),
    ("deepseek-ai/DeepSeek-V4-Flash", "Huawei Ascend", "910B4", "vllm", "HuggingFace"),
    # ModelScope examples — same models, China-side mirror.
    ("Qwen/Qwen3-30B-A3B", "NVIDIA", "A100-80G", "vllm", "ModelScope"),
    ("deepseek-ai/DeepSeek-V3", "Huawei Ascend", "910B4", "vllm", "ModelScope"),
]
132
+
133
+ # ---------------------------------------------------------------------------
134
+ # Output rendering
135
+
136
+
137
+ def _fmt_bytes(n: int | None) -> str:
138
+ if n is None:
139
+ return "—"
140
+ if n < 1024:
141
+ return f"{n} B"
142
+ f = float(n)
143
+ for u in ["KB", "MB", "GB", "TB"]:
144
+ f /= 1024
145
+ if f < 1024:
146
+ return f"{f:.2f} {u}"
147
+ return f"{f:.2f} PB"
148
+
149
+
150
+ def _fmt_params(n: int | None) -> str:
151
+ if not n:
152
+ return "—"
153
+ if n >= 1_000_000_000:
154
+ return f"{n / 1_000_000_000:.1f}B"
155
+ if n >= 1_000_000:
156
+ return f"{n / 1_000_000:.1f}M"
157
+ return f"{n:,}"
158
+
159
+
160
+ def _label_color(label: str) -> str:
161
+ """Map a provenance label to a CSS color (visible in both light and dark)."""
162
+ return {
163
+ "verified": "#16a34a", # green-600
164
+ "inferred": "#2563eb", # blue-600
165
+ "estimated": "#d97706", # amber-600
166
+ "cited": "#7c3aed", # violet-600
167
+ "unverified": "#9a3412", # orange-800
168
+ "unknown": "#6b7280", # gray-500
169
+ "llm-opinion": "#db2777", # pink-600
170
+ }.get(label, "#6b7280")
171
+
172
+
173
def _label_chip(label_key: str) -> str:
    """Build the HTML ``<span>`` chip for a provenance label.

    The chip text is the translation of ``label.<label_key>``; the label's
    color is used for the text, and with the hex alpha suffixes ``1a`` / ``55``
    for the background and border.
    """
    hue = _label_color(label_key)
    caption = t(f"label.{label_key}")
    inline_style = f"background:{hue}1a;color:{hue};border:1px solid {hue}55"
    return f'<span class="lc-chip" style="{inline_style}">{caption}</span>'
181
+
182
+
183
+ def _stat_card(label: str, value: str, sublabel: str = "", chip: str = "") -> str:
184
+ chip_html = f"<div class='lc-stat-chip'>{chip}</div>" if chip else ""
185
+ sub_html = f"<div class='lc-stat-sub'>{sublabel}</div>" if sublabel else ""
186
+ return (
187
+ f"<div class='lc-stat'>"
188
+ f"<div class='lc-stat-value'>{value}</div>"
189
+ f"<div class='lc-stat-label'>{label}</div>"
190
+ f"{sub_html}{chip_html}"
191
+ f"</div>"
192
+ )
193
+
194
+
195
+ def _esc(s: str) -> str:
196
+ return (
197
+ str(s)
198
+ .replace("&", "&amp;")
199
+ .replace("<", "&lt;")
200
+ .replace(">", "&gt;")
201
+ )
202
+
203
+
204
def _render(report: EvaluationReport, locale: str) -> str:
    """Render a single evaluation report as one HTML fragment.

    The active locale is set via ``set_locale`` first. The fragment is then
    built section by section and concatenated inside a
    ``<div class='lc-result'>`` wrapper in this order: headline, architecture,
    GPU spec, quantization reconciliation, KV cache, fleet, performance,
    engine compatibility, generated command, and finally the star CTA.

    The headline, architecture and reconciliation sections are always
    emitted. Every other section starts as an empty string and is only filled
    in when the corresponding data on ``report`` is truthy.

    Args:
        report: The evaluation result to display.
        locale: ``"zh"`` selects the Chinese strings; any other value selects
            the English ones.

    Returns:
        The HTML markup as a string.
    """
    set_locale(locale)  # type: ignore[arg-type]
    is_zh = locale == "zh"

    # Short aliases: profile, weight, reconciliation, fleet.
    p, w, r, f = report.profile, report.weight, report.reconciliation, report.fleet

    # ---- Headline stat cards -------------------------------------------------
    weight_str = _fmt_bytes(w.total_bytes.value)
    weight_chip = _label_chip(w.total_bytes.label.value)
    quant_chip = _label_chip(w.quantization_guess.label.value)
    # First fleet option whose tier is "prod", or None when there is no fleet
    # or no such option.
    prod_opt = (
        next((o for o in (f.options if f else []) if o.tier == "prod"), None) if f else None
    )
    prod_gpus = str(prod_opt.gpu_count) if prod_opt else "—"
    prod_concurrent = str(prod_opt.max_concurrent_at_reference_ctx) if prod_opt else "—"

    headline = (
        f"<div class='lc-header'>"
        f"<div class='lc-title'>{_esc(report.model_id)}</div>"
        f"<div class='lc-subtitle'>"
        f"{_esc(report.gpu)} · {_esc(report.engine)}"
        f"</div></div>"
        f"<div class='lc-stats'>"
        f"{_stat_card('Weight' if not is_zh else '权重', weight_str, sublabel='from safetensors API' if not is_zh else '取自 safetensors API', chip=weight_chip)}"
        f"{_stat_card('Quantization' if not is_zh else '量化', _esc(w.quantization_guess.value), sublabel='resolved scheme' if not is_zh else '已识别方案', chip=quant_chip)}"
        f"{_stat_card('Prod GPUs' if not is_zh else 'Prod GPU 数', prod_gpus, sublabel='for 16-user prod' if not is_zh else '生产档(16 路并发)')}"
        f"{_stat_card('Users @ 128K' if not is_zh else '用户 @ 128K', prod_concurrent, sublabel='concurrent at prod tier' if not is_zh else '生产档的并发')}"
        f"</div>"
    )

    # Provenance footer for the headline
    quant_source = _esc(w.quantization_guess.source or "")
    headline += f"<div class='lc-prov'>{quant_source}</div>"

    # ---- Architecture --------------------------------------------------------
    # (key, value) rows; optional rows are added only when the profile has the
    # corresponding component.
    arch_rows: list[tuple[str, str]] = [("model_type", p.model_type)]
    if p.attention:
        arch_rows.append(
            (
                "attention",
                f"{p.attention.variant} (heads={p.attention.num_heads}, "
                f"kv_heads={p.attention.num_kv_heads}, hd={p.attention.head_dim})",
            )
        )
    if p.moe:
        arch_rows.append(
            (
                "moe",
                f"{p.moe.num_routed_experts} routed + "
                f"{p.moe.num_shared_experts} shared, top-{p.moe.num_experts_per_tok}",
            )
        )
    if p.sliding_window:
        arch_rows.append(("sliding_window", str(p.sliding_window)))

    arch_html = "".join(
        f"<tr><td><code>{_esc(k)}</code></td><td><code>{_esc(v)}</code></td></tr>"
        for k, v in arch_rows
    )
    arch_explainer = (
        "从模型 config.json 读出来的,决定后续所有公式怎么走(是否分组注意力、是否 MoE、是否滑动窗口)。"
        if is_zh
        else "Read straight from the model's config.json. Drives every formula "
        "downstream — attention sharding, MoE active-expert ratio, sliding window."
    )
    arch_section = (
        f"<div class='lc-section'><h3>{'架构' if is_zh else 'Architecture'}</h3>"
        f"<div class='lc-section-help'>{arch_explainer}</div>"
        f"<table class='lc-table'>{arch_html}</table></div>"
    )

    # ---- Reconciliation ------------------------------------------------------
    # Only the first five candidates are shown; the one whose scheme equals
    # r.best.value is highlighted and marked with a check.
    recon_rows = []
    for c in r.candidates[:5]:
        is_best = c.scheme == r.best.value
        cls = " class='lc-best'" if is_best else ""
        marker = " ✓" if is_best else ""
        recon_rows.append(
            f"<tr{cls}><td><code>{_esc(c.scheme)}</code>{marker}</td>"
            f"<td>{_fmt_bytes(c.predicted_bytes)}</td>"
            f"<td>{c.relative_error * 100:.1f}%</td></tr>"
        )
    recon_explainer = (
        "用每种量化方案预测应该有多少字节,跟实际 safetensors 字节对比。误差最小的胜出。"
        "FP4_FP8_MIXED / GPTQ_INT4 / AWQ_INT4 在 0.55 bpp 处会打平,需要 config 或 dtype 进一步区分。"
        if is_zh
        else "Predict bytes under each quantization hypothesis, compare against the real "
        "safetensors size. Lowest error wins. FP4_FP8_MIXED / GPTQ_INT4 / AWQ_INT4 tie "
        "at 0.55 bpp — broken via config.json or per-tensor dtype."
    )
    recon_section = (
        f"<div class='lc-section'>"
        f"<h3>{'量化反演' if is_zh else 'Quantization reconciliation'}</h3>"
        f"<div class='lc-section-help'>{recon_explainer}</div>"
        f"<table class='lc-table lc-table-recon'>"
        f"<thead><tr><th>Scheme</th>"
        f"<th>{'预测字节' if is_zh else 'Predicted'}</th>"
        f"<th>{'误差' if is_zh else 'Error'}</th></tr></thead>"
        f"<tbody>{''.join(recon_rows)}</tbody></table></div>"
    )

    # ---- Fleet ---------------------------------------------------------------
    fleet_section = ""
    if f and f.options:
        # Pick which context lengths get their own concurrency column.
        # Always include 128K if any option has it; also include the model max
        # if it's larger (e.g. 1M for DeepSeek-V4-Flash) so the user can compare
        # "fits 23 users at 128K but only 2 at 1M".
        all_ctxs: set[int] = set()
        for opt in f.options:
            for ctx, _ in opt.max_concurrent_by_context:
                all_ctxs.add(ctx)
        ctx_cols: list[int] = []
        if 131_072 in all_ctxs:
            ctx_cols.append(131_072)
        max_ctx = max(all_ctxs) if all_ctxs else 0
        if max_ctx > 131_072 and max_ctx not in ctx_cols:
            ctx_cols.append(max_ctx)
        # Fallback: no 128K entry and nothing larger — show the largest
        # context that is available.
        if not ctx_cols and all_ctxs:
            ctx_cols.append(max_ctx)

        def _ctx_label(ctx: int) -> str:
            # The "M" branch uses decimal millions while the "K" branch
            # divides by 1024.
            if ctx >= 1_000_000:
                return f"{ctx // 1_000_000}M" if ctx % 1_000_000 == 0 else f"{ctx / 1_000_000:.1f}M"
            if ctx >= 1024:
                return f"{ctx // 1024}K"
            return str(ctx)

        rows = []
        for opt in f.options:
            # The option whose tier equals f.best_tier is starred and
            # highlighted.
            star = " ★" if opt.tier == f.best_tier else ""
            cls = " class='lc-best'" if opt.tier == f.best_tier else ""
            # Clamped at zero so the cell never shows a negative size.
            headroom = max(0, opt.usable_bytes_per_gpu - opt.weight_bytes_per_gpu)
            ctx_map = dict(opt.max_concurrent_by_context)
            ctx_cells = "".join(f"<td>{ctx_map.get(c, '—')}</td>" for c in ctx_cols)
            rows.append(
                f"<tr{cls}><td><code>{opt.tier}{star}</code></td>"
                f"<td>{opt.gpu_count}</td>"
                f"<td>{_fmt_bytes(opt.weight_bytes_per_gpu)}</td>"
                f"<td>{_fmt_bytes(headroom)}</td>"
                f"{ctx_cells}</tr>"
            )

        ctx_headers = "".join(
            f"<th>{('@ ' + _ctx_label(c) + ' 并发') if is_zh else ('Concurrent @ ' + _ctx_label(c))}</th>"
            for c in ctx_cols
        )
        fleet_explainer = (
            "min = 刚好放得下;dev = 8 路并发场景;prod = 16 路并发场景。★ = 推荐。"
            if is_zh
            else "min = barely fits weights; dev = sized for 8 concurrent at 128K; "
            "prod = sized for 16 concurrent at 128K. ★ = recommended."
        )
        fleet_section = (
            f"<div class='lc-section'>"
            f"<h3>{'推荐集群' if is_zh else 'Recommended fleet'}</h3>"
            f"<div class='lc-section-help'>{fleet_explainer}</div>"
            f"<table class='lc-table'>"
            f"<thead><tr><th>Tier</th><th>GPUs</th>"
            f"<th>Weight/GPU</th><th>Headroom/GPU</th>"
            f"{ctx_headers}</tr></thead>"
            f"<tbody>{''.join(rows)}</tbody></table></div>"
        )

    # ---- Performance ---------------------------------------------------------
    perf_explainer = (
        "Prefill 用算力公式(FLOPs = 2 × 参数 × 输入 token),decode 用带宽公式(吞吐 = 带宽 × 利用率 / 权重字节)。"
        "Bottleneck 标 memory_bandwidth 说明 decode 是带宽瓶颈,加显存带宽更高的 GPU 比加算力更划算。"
        if is_zh
        else "Prefill uses the compute formula (FLOPs = 2 × params × input_tokens, Kaplan 2020). "
        "Decode uses memory-bandwidth formula (tps = BW × util / weight_bytes, vLLM paper). "
        "Bottleneck = memory_bandwidth means a higher-BW GPU helps more than more FLOPS."
    )
    perf_section = ""
    # Rendered only when prefill, decode and concurrency are all present.
    if report.prefill and report.decode and report.concurrency:
        max_users = report.concurrency.max_concurrent.value
        bn = report.concurrency.bottleneck
        # (label, value_html, sub_label) triples. The value is inserted
        # without escaping below, so the bottleneck entry escapes its own text.
        items = [
            (
                "Prefill latency" if not is_zh else "Prefill 延迟",
                f"{report.prefill.latency_ms.value:.0f} ms",
                f"@ {report.perf_input_tokens or 2000} input tokens",
            ),
            (
                "Cluster decode" if not is_zh else "集群 decode 吞吐",
                f"{report.decode.cluster_tokens_per_sec.value:.0f} tok/s",
                "",
            ),
            (
                "Max concurrent users" if not is_zh else "最大并发用户",
                str(max_users),
                "",
            ),
            (
                "Bottleneck" if not is_zh else "瓶颈",
                f"<code>{_esc(bn)}</code>",
                "",
            ),
        ]
        items_html = "".join(
            f"<div class='lc-perf-item'>"
            f"<div class='lc-perf-value'>{v}</div>"
            f"<div class='lc-perf-label'>{_esc(label)}</div>"
            f"<div class='lc-perf-sub'>{_esc(sub)}</div></div>"
            for label, v, sub in items
        )
        perf_section = (
            f"<div class='lc-section'>"
            f"<h3>{'性能' if is_zh else 'Performance'}</h3>"
            f"<div class='lc-section-help'>{perf_explainer}</div>"
            f"<div class='lc-perf'>{items_html}</div></div>"
        )

    # ---- KV cache per request -----------------------------------------------
    kv_section = ""
    if report.kv_cache_by_context:
        rows = []
        # One row per context length, in ascending key order.
        for ctx, av in sorted(report.kv_cache_by_context.items()):
            rows.append(
                f"<tr><td>{ctx:,}</td><td>{_fmt_bytes(av.value)}</td>"
                f"<td>{_label_chip(av.label.value)}</td></tr>"
            )
        kv_explainer = (
            "单个请求在不同 context 长度下需要多少 KV 缓存。这是决定一张 GPU 能并发跑多少请求的关键。"
            "MLA / MQA 模型这里会比标准 GQA 小很多。"
            if is_zh
            else "How much KV cache one request consumes at each context length. "
            "This is what limits per-GPU concurrency. MLA / MQA models are "
            "dramatically smaller here than standard GQA."
        )
        kv_section = (
            f"<div class='lc-section'>"
            f"<h3>{'KV 缓存(每请求)' if is_zh else 'KV cache per request'}</h3>"
            f"<div class='lc-section-help'>{kv_explainer}</div>"
            f"<table class='lc-table lc-table-recon'>"
            f"<thead><tr><th>{'Context tokens' if not is_zh else 'Context 长度'}</th>"
            f"<th>{'KV bytes' if not is_zh else 'KV 字节'}</th>"
            f"<th>{'Label' if not is_zh else '标签'}</th></tr></thead>"
            f"<tbody>{''.join(rows)}</tbody></table></div>"
        )

    # ---- Engine compatibility -----------------------------------------------
    engine_section = ""
    em = report.engine_match
    if em:
        # The parameter `f` is local to this helper (and to the generator
        # expressions below); the outer fleet variable `f` is not rebound.
        def _fmt_flag(f) -> str:  # noqa: ANN001
            base = f"{f.flag} {f.value}".strip()
            return base
        flags = ", ".join(_fmt_flag(f) for f in em.required_flags) if em.required_flags else "—"
        opt_flags = ", ".join(_fmt_flag(f) for f in em.optional_flags) if em.optional_flags else "—"
        caveats = em.caveats_zh if is_zh else em.caveats_en
        sources_html = "—"
        if em.sources:
            # One link per source, with the capture date appended when set.
            sources_html = "<br>".join(
                f'<a href="{_esc(s.url)}" target="_blank" rel="noopener">{_esc(s.url)}</a>'
                + (
                    f" <span class='lc-prov'>({_esc(s.captured_date)})</span>"
                    if s.captured_date
                    else ""
                )
                for s in em.sources
            )
        # (header, cell_html) pairs; cells are already HTML and are inserted
        # as-is when `body` is built.
        rows = [
            (("引擎" if is_zh else "Engine"), f"<code>{_esc(em.engine)}</code>"),
            (
                ("版本要求" if is_zh else "Version"),
                f"<code>{_esc(em.version_spec)}</code>",
            ),
            (
                ("支持级别" if is_zh else "Support"),
                _label_chip(em.support) if em.support in {"verified", "cited", "unverified"} else f"<code>{_esc(em.support)}</code>",
            ),
            (
                ("验证级别" if is_zh else "Verification"),
                _label_chip(em.verification_level),
            ),
            (("必需 flag" if is_zh else "Required flags"), f"<code>{_esc(flags)}</code>"),
            (("可选 flag" if is_zh else "Optional flags"), f"<code>{_esc(opt_flags)}</code>"),
        ]
        if caveats:
            rows.append((("注意事项" if is_zh else "Caveats"), _esc(caveats)))
        rows.append((("来源" if is_zh else "Sources"), sources_html))
        body = "".join(f"<tr><td>{k}</td><td>{v}</td></tr>" for k, v in rows)
        engine_explainer = (
            "这个模型在 vLLM/SGLang 哪个版本起能跑、需要哪些必需 flag、有哪些优化 flag。"
            "verification_level 标 cited 表示从 PR / release note 引用,verified 表示实测过。"
            if is_zh
            else "Which engine version supports this model, what flags are required, "
            "and which optional flags help. verification_level=cited means we got it "
            "from a PR or release note; verified means we actually ran it."
        )
        engine_section = (
            f"<div class='lc-section'>"
            f"<h3>{'引擎兼容性' if is_zh else 'Engine compatibility'}</h3>"
            f"<div class='lc-section-help'>{engine_explainer}</div>"
            f"<table class='lc-table'>{body}</table></div>"
        )

    # ---- GPU spec ------------------------------------------------------------
    gpu_section = ""
    g = report.gpu_spec
    if g:
        notes = g.notes_zh if is_zh else g.notes_en
        rows = [
            ("HBM", f"{g.memory_gb} GB"),
            ("Memory BW", f"{g.memory_bandwidth_gbps or '—'} GB/s"),
            ("NVLink BW", f"{g.nvlink_bandwidth_gbps} GB/s"),
            ("FP16 TFLOPS", f"{g.fp16_tflops}"),
            ("FP8", "✓" if g.fp8_support else "—"),
            ("FP4", "✓" if g.fp4_support else "—"),
        ]
        rows_html = "".join(
            f"<tr><td>{_esc(k)}</td><td><code>{_esc(v)}</code></td></tr>"
            for k, v in rows
        )
        notes_html = (
            f"<div class='lc-prov' style='margin-top:8px'>{_esc(notes)}</div>" if notes else ""
        )
        # A spec_source starting with "http" becomes a link; any other
        # non-empty value is shown as plain text; an empty one is omitted.
        source_html = (
            f"<div class='lc-prov'>{'来源' if is_zh else 'Source'}: "
            f"<a href='{_esc(g.spec_source)}' target='_blank' rel='noopener'>"
            f"{_esc(g.spec_source)}</a></div>"
            if g.spec_source and g.spec_source.startswith("http")
            else (f"<div class='lc-prov'>{_esc(g.spec_source)}</div>" if g.spec_source else "")
        )
        gpu_explainer = (
            "目标 GPU 的硬件规格。Memory BW 决定 decode 能跑多快,FP8/FP4 支持决定能用什么量化。"
            if is_zh
            else "Hardware spec of the chosen GPU. Memory BW caps decode throughput; "
            "FP8/FP4 support determines which quantization paths actually accelerate."
        )
        gpu_section = (
            f"<div class='lc-section'>"
            f"<h3>{'目标 GPU 规格' if is_zh else 'Target GPU spec'} — <code>{_esc(g.id)}</code></h3>"
            f"<div class='lc-section-help'>{gpu_explainer}</div>"
            f"<table class='lc-table'>{rows_html}</table>"
            f"{notes_html}{source_html}"
            f"</div>"
        )

    # ---- Generated command ---------------------------------------------------
    cmd_section = ""
    if report.generated_command:
        cmd_explainer = (
            "可以直接复制粘贴到带显卡的机器上跑。flag 是按推荐 tier 的 GPU 数 + 引擎兼容矩阵的必需 flag 自动拼的。"
            if is_zh
            else "Copy-pasteable on a machine with the right GPUs. Flags auto-assembled "
            "from the recommended fleet tier + engine compat matrix's required flags."
        )
        cmd_section = (
            f"<div class='lc-section'>"
            f"<h3>{'生成命令' if is_zh else 'Generated command'}</h3>"
            f"<div class='lc-section-help'>{cmd_explainer}</div>"
            f"<pre class='lc-cmd'><code>{_esc(report.generated_command)}</code></pre></div>"
        )

    # Display order differs from build order: the GPU spec comes right after
    # the architecture, and the KV cache comes before the fleet.
    return (
        "<div class='lc-result'>"
        + headline
        + arch_section
        + gpu_section
        + recon_section
        + kv_section
        + fleet_section
        + perf_section
        + engine_section
        + cmd_section
        + _render_star_cta(is_zh)
        + "</div>"
    )
574
+
575
+
576
def _render_compare(reports: list[EvaluationReport], locale: str) -> str:
    """Render a side-by-side HTML comparison of several evaluation reports.

    The reports are expected to describe the same model and engine on
    different GPUs; the header is taken from the first report only.

    Each metric row declares whether a higher or lower value is better
    ("info" rows are never contested). Every column that holds the best value
    of a contested row is painted with the ``lc-cmp-winner`` class and earns
    one win towards the overall summary line. A row in which all numeric
    values are identical has no winner.

    Args:
        reports: Reports to compare, one table column per report.
        locale: ``"zh"`` selects Chinese labels; anything else selects English.
            The value is also forwarded to ``set_locale``.

    Returns:
        An HTML fragment wrapped in ``<div class='lc-result'>``.

    Raises:
        IndexError: If ``reports`` is empty (the header reads ``reports[0]``).
    """
    set_locale(locale)  # type: ignore[arg-type]
    is_zh = locale == "zh"

    # All reports share the same model_id + engine — pull from the first.
    head = reports[0]
    n_gpus = str(len(reports))
    subtitle = ("对比 " + n_gpus + " 张 GPU") if is_zh else ("Comparing " + n_gpus + " GPUs")
    title = (
        f"<div class='lc-header'>"
        f"<div class='lc-title'>{_esc(head.model_id)}</div>"
        f"<div class='lc-subtitle'>"
        f"{subtitle}"
        f" · {_esc(head.engine)}"
        f"</div></div>"
    )

    # ---- Value getters ------------------------------------------------------
    def _prod_option(r: EvaluationReport):
        """Return the fleet option whose tier is "prod", or None."""
        if not r.fleet:
            return None
        return next((o for o in r.fleet.options if o.tier == "prod"), None)

    def _max_concurrent(r: EvaluationReport) -> int | None:
        prod = _prod_option(r)
        return prod.max_concurrent_at_reference_ctx if prod else None

    def _prod_gpu_count(r: EvaluationReport) -> int | None:
        prod = _prod_option(r)
        return prod.gpu_count if prod else None

    def _kv_per_user_128k(r: EvaluationReport) -> int | None:
        # 131072 tokens == 128K context.
        av = r.kv_cache_by_context.get(131072)
        return av.value if av is not None else None

    def _native_precision_score(r: EvaluationReport) -> int | None:
        g = r.gpu_spec
        if g is None:
            return None
        return (1 if g.fp8_support else 0) + (1 if g.fp4_support else 0)

    def _max_context_tokens(r: EvaluationReport) -> int | None:
        """Effective max context the model claims to support.

        In modern HF configs (LLaMA 3+, DeepSeek V3+, Qwen2.5+), the field
        max_position_embeddings already reflects the post-RoPE/YaRN-scaling
        window. rope_scaling_factor is recorded for provenance but must NOT
        be multiplied in again — that double-counts.
        """
        pos = r.profile.position
        if pos is None or pos.max_position_embeddings is None:
            return None
        return int(pos.max_position_embeddings)

    def _cluster_qps(r: EvaluationReport) -> float | None:
        """Steady-state queries/sec the cluster sustains:
        QPS = cluster_decode_tokens_per_sec / output_tokens_per_request."""
        if not r.decode or r.decode.cluster_tokens_per_sec.value <= 0:
            return None
        out = r.perf_output_tokens or 512
        if out <= 0:
            return None
        return r.decode.cluster_tokens_per_sec.value / out

    # ---- Cell formatters ----------------------------------------------------
    def _fmt_native(v: int | None) -> str:
        if v is None:
            return "—"
        return {0: "FP16 only", 1: "FP8", 2: "FP8 + FP4"}.get(v, str(v))

    def _fmt_context(v: int | None) -> str:
        """Binary-base formatting so 131072 reads as '128K' not '131K'."""
        if v is None:
            return "—"
        if v >= 1024 * 1024:
            return f"{v / (1024 * 1024):.1f}M".replace(".0M", "M")
        if v >= 1024:
            return f"{v // 1024}K"
        return str(v)

    def _fmt_plain(v) -> str:
        return str(v) if v is not None else "—"

    def _fmt_tok_per_sec(v) -> str:
        return f"{v:.0f} tok/s" if v is not None else "—"

    # Text for a GPU whose NVLink bandwidth is 0/falsy. Localized: the Chinese
    # literal used to leak into the English table.
    no_nvlink = "无" if is_zh else "None"

    # Metric definitions: (label_en, label_zh, value_fn, better=lower|higher|info, formatter)
    # "info" rows are not contested — used for model-determined facts (same across
    # GPUs by construction) or for descriptive cells like Bottleneck.
    metrics = [
        # ── Model-determined rows (info; identical across GPUs by definition) ──
        ("Quantization", "量化方案",
         lambda r: r.weight.quantization_guess.value, "info",
         lambda v: _esc(str(v)) if v else "—"),
        ("Weights total", "权重总量",
         lambda r: r.weight.total_bytes.value, "info",
         lambda v: _fmt_bytes(v) if v else "—"),
        ("KV / user @ 128K", "KV / 用户 @ 128K",
         _kv_per_user_128k, "info",
         lambda v: _fmt_bytes(v) if v is not None else "—"),
        ("Max context", "最大上下文",
         _max_context_tokens, "info",
         _fmt_context),
        # ── GPU hardware specs (contested) ──
        ("HBM / card", "单卡显存",
         lambda r: r.gpu_spec.memory_gb if r.gpu_spec else None, "higher",
         lambda v: f"{v} GB" if v is not None else "—"),
        ("HBM bandwidth", "显存带宽",
         lambda r: r.gpu_spec.memory_bandwidth_gbps if r.gpu_spec else None, "higher",
         lambda v: f"{v:,} GB/s" if v is not None else "—"),
        ("NVLink / card", "NVLink 带宽",
         lambda r: r.gpu_spec.nvlink_bandwidth_gbps if r.gpu_spec else None, "higher",
         lambda v: (f"{v} GB/s" if v else no_nvlink) if v is not None else "—"),
        ("Native FP8/FP4", "原生低精度",
         _native_precision_score, "higher",
         _fmt_native),
        # ── Sizing & performance outcomes (contested) ──
        ("Prod GPUs", "生产档 GPU 数",
         _prod_gpu_count, "lower",
         _fmt_plain),
        ("Users @ 128K", "用户 @ 128K",
         _max_concurrent, "higher",
         _fmt_plain),
        ("Prefill latency", "Prefill 延迟",
         lambda r: r.prefill.latency_ms.value if r.prefill else None, "lower",
         lambda v: f"{v:.0f} ms" if v is not None else "—"),
        ("Per-GPU decode", "单卡 decode 吞吐",
         lambda r: r.decode.per_gpu_tokens_per_sec.value if r.decode else None, "higher",
         _fmt_tok_per_sec),
        ("Cluster decode", "集群 decode 吞吐",
         lambda r: r.decode.cluster_tokens_per_sec.value if r.decode else None, "higher",
         _fmt_tok_per_sec),
        ("Sustained QPS", "稳态 QPS",
         _cluster_qps, "higher",
         lambda v: f"{v:.2f} q/s" if v is not None else "—"),
        # ── Diagnostic (info — string, not a number race) ──
        ("Bottleneck", "瓶颈",
         lambda r: r.concurrency.bottleneck if r.concurrency else None, "info",
         lambda v: f"<code>{_esc(str(v))}</code>" if v else "—"),
    ]

    def _winner_indices(values: list, better: str) -> set[int]:
        """Return the column indices holding the best value of one row.

        Non-numeric values (including None) are excluded from the contest.
        No winner is declared when fewer than two distinct numeric values
        exist. Columns that tie for the best value all win, so column order
        never decides the highlight.
        """
        if better not in ("higher", "lower"):
            return set()
        numeric = [(i, v) for i, v in enumerate(values) if isinstance(v, (int, float))]
        if len({v for _, v in numeric}) <= 1:
            return set()
        pick = max if better == "higher" else min
        best = pick(v for _, v in numeric)
        return {i for i, v in numeric if v == best}

    # GPU column headers
    gpu_headers = "".join(
        f"<th class='lc-cmp-gpu'>{_esc(r.gpu)}</th>" for r in reports
    )

    # One pass builds the table rows and tallies per-column wins, so each
    # getter runs once per report.
    rows_html = []
    win_counts = [0] * len(reports)
    for label_en, label_zh, getter, better, fmt in metrics:
        values = [getter(r) for r in reports]
        winners = _winner_indices(values, better)
        for idx in winners:
            win_counts[idx] += 1

        cells = []
        for i, v in enumerate(values):
            cls = " class='lc-cmp-winner'" if i in winners else ""
            cells.append(f"<td{cls}>{fmt(v)}</td>")

        label = label_zh if is_zh else label_en
        # Tag info rows so the eye knows "this is a model fact, not a contest".
        is_info = better == "info"
        label_cls = "lc-cmp-row-label lc-cmp-row-info" if is_info else "lc-cmp-row-label"
        tr_cls = " class='lc-cmp-tr-info'" if is_info else ""
        rows_html.append(
            f"<tr{tr_cls}><th class='{label_cls}'>{_esc(label)}</th>{''.join(cells)}</tr>"
        )

    # Aggregate winner — the column(s) with the most row wins.
    overall_text = ""
    if any(win_counts):
        max_wins = max(win_counts)
        leaders = [reports[i].gpu for i, c in enumerate(win_counts) if c == max_wins]
        if len(leaders) == 1:
            contested_total = sum(1 for m in metrics if m[3] != "info")
            overall_text = (
                f"<div class='lc-cmp-summary'>"
                f"{'综合最优' if is_zh else 'Overall winner'}: "
                f"<strong>{_esc(leaders[0])}</strong> "
                f"({max_wins}/{contested_total} "
                f"{'指标领先' if is_zh else 'metrics lead'})"
                f"</div>"
            )
        else:
            overall_text = (
                f"<div class='lc-cmp-summary'>"
                f"{'势均力敌' if is_zh else 'Tied'}: "
                f"<strong>{_esc(' / '.join(leaders))}</strong>"
                f"</div>"
            )

    table = (
        f"<div class='lc-section'>"
        f"<h3>{'对比' if is_zh else 'Side-by-side comparison'}</h3>"
        f"<div class='lc-cmp-wrap'>"
        f"<table class='lc-cmp-table'>"
        f"<thead><tr>"
        f"<th class='lc-cmp-row-label'></th>"
        f"{gpu_headers}"
        f"</tr></thead>"
        f"<tbody>{''.join(rows_html)}</tbody>"
        f"</table></div>"
        f"{overall_text}"
        f"</div>"
    )

    # Per-GPU detail headlines (small stat cards) below the table
    detail_blocks = []
    for r in reports:
        weight_str = _fmt_bytes(r.weight.total_bytes.value)
        prod = _prod_gpu_count(r)
        users = _max_concurrent(r)
        detail_blocks.append(
            f"<div class='lc-cmp-detail'>"
            f"<div class='lc-cmp-detail-gpu'>{_esc(r.gpu)}</div>"
            f"<div class='lc-cmp-detail-row'>"
            f"<span>{'权重' if is_zh else 'Weight'}</span><strong>{weight_str}</strong></div>"
            f"<div class='lc-cmp-detail-row'>"
            f"<span>{'生产 GPU' if is_zh else 'Prod GPUs'}</span>"
            f"<strong>{prod if prod is not None else '—'}</strong></div>"
            f"<div class='lc-cmp-detail-row'>"
            f"<span>{'用户 @ 128K' if is_zh else 'Users @ 128K'}</span>"
            f"<strong>{users if users is not None else '—'}</strong></div>"
            f"</div>"
        )
    detail_section = (
        f"<div class='lc-section'>"
        f"<h3>{'各档详情' if is_zh else 'Per-GPU detail'}</h3>"
        f"<div class='lc-cmp-details'>{''.join(detail_blocks)}</div>"
        f"</div>"
    )

    return (
        "<div class='lc-result'>"
        + title
        + table
        + detail_section
        + _render_star_cta(is_zh)
        + "</div>"
    )
837
+
838
+
839
+ def _render_star_cta(is_zh: bool) -> str:
840
+ """Tail-of-result CTA — shown right after the user got their answer,
841
+ which is when satisfaction is highest and the GitHub star ask reads as
842
+ 'thanks for the tool' rather than 'please give me attention'."""
843
+ en_msg = "Saved you GPU-sizing math?"
844
+ zh_msg = "省了你 GPU 选型的时间?"
845
+ cta_en = "Star on GitHub"
846
+ cta_zh = "给个 Star"
847
+ text_top = zh_msg if is_zh else en_msg
848
+ text_bottom = en_msg if is_zh else zh_msg
849
+ cta = f"{cta_zh if is_zh else cta_en} · {cta_en if is_zh else cta_zh}"
850
+ return (
851
+ "<a class='lc-star-cta' href='https://github.com/FlyTOmeLight/llm-cal' "
852
+ "target='_blank' rel='noopener'>"
853
+ "<svg viewBox='0 0 16 16' width='18' height='18' aria-hidden='true' fill='currentColor'>"
854
+ "<path d='M8 0C3.58 0 0 3.58 0 8a8 8 0 0 0 5.47 7.59c.4.07.55-.17.55-.38v-1.33c-2.22.48-2.69-1.07-2.69-1.07-.36-.92-.89-1.17-.89-1.17-.73-.5.06-.49.06-.49.81.06 1.23.83 1.23.83.72 1.23 1.88.87 2.34.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.83-2.15-.08-.2-.36-1.02.08-2.13 0 0 .67-.21 2.2.82a7.6 7.6 0 0 1 4 0c1.53-1.04 2.2-.82 2.2-.82.44 1.11.16 1.93.08 2.13.51.56.83 1.27.83 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48v2.19c0 .21.15.46.55.38A8 8 0 0 0 16 8c0-4.42-3.58-8-8-8z'/>"
855
+ "</svg>"
856
+ f"<div class='lc-star-cta-text'>"
857
+ f"<div class='lc-star-cta-q'>{text_top}</div>"
858
+ f"<div class='lc-star-cta-q-en'>{text_bottom}</div>"
859
+ f"</div>"
860
+ f"<div class='lc-star-cta-action'>{cta} →</div>"
861
+ "</a>"
862
+ )
863
+
864
+
865
def _render_explain(entries: list[ExplainEntry], is_zh: bool) -> str:
    """Turn the --explain derivation trace into an HTML section.

    Each entry contributes one ``lc-explain-entry`` block: heading, formula,
    an optional bullet list of inputs, an optional numbered list of steps,
    the result, and an optional source line. All entry text goes through
    ``_esc`` before being embedded.

    Args:
        entries: Derivation entries to render, in display order.
        is_zh: True selects the Chinese labels, False the English ones.

    Returns:
        The HTML fragment, or an empty string when ``entries`` is empty.
    """
    if not entries:
        return ""

    if is_zh:
        section_title, result_label, source_label = "推导链 (--explain)", "结果", "来源"
    else:
        section_title, result_label, source_label = (
            "Derivation trace (--explain)",
            "Result",
            "Source",
        )

    def _input_item(inp) -> str:
        # The note is optional and rendered as a trailing em-dash remark.
        note = f" — <em>{_esc(inp.note)}</em>" if inp.note else ""
        return (
            f"<li><code>{_esc(inp.name)}</code> = "
            f"<strong>{_esc(inp.value)}</strong> "
            f"<span class='lc-explain-label'>{_esc(inp.label)}</span>"
            f"{note}</li>"
        )

    html = [
        "<div class='lc-result'>",
        "<div class='lc-section'>",
        f"<h3>{section_title}</h3>",
    ]
    for entry in entries:
        inputs_part = ""
        if entry.inputs:
            items = [_input_item(inp) for inp in entry.inputs]
            inputs_part = "<ul class='lc-explain-inputs'>" + "".join(items) + "</ul>"

        steps_part = ""
        if entry.steps:
            steps = [f"<li>{_esc(step)}</li>" for step in entry.steps]
            steps_part = "<ol class='lc-explain-steps'>" + "".join(steps) + "</ol>"

        source_part = ""
        if entry.source:
            source_part = f"<div class='lc-prov'>{source_label}: {_esc(entry.source)}</div>"

        html.append(
            f"<div class='lc-explain-entry'>"
            f"<div class='lc-explain-heading'>{_esc(entry.heading)}</div>"
            f"<div class='lc-explain-formula'><code>{_esc(entry.formula)}</code></div>"
            f"{inputs_part}{steps_part}"
            f"<div class='lc-explain-result'>"
            f"{result_label}: <strong>{_esc(entry.result)}</strong></div>"
            f"{source_part}"
            f"</div>"
        )
    html.append("</div></div>")
    return "".join(html)
908
+
909
+
910
def _render_llm_review(content: str | None, error: str | None, model: str, is_zh: bool) -> str:
    """Render the optional LLM second-opinion panel.

    Args:
        content: Reviewer output text; treated as plain text, not markdown.
        error: Reviewer error message. When truthy it takes precedence and an
            error block is returned instead of the panel.
        model: Reviewer model name shown next to the section heading.
        is_zh: True selects the Chinese labels, False the English ones.

    Returns:
        An error block when ``error`` is set, an empty string when there is
        no content, otherwise the panel HTML.
    """
    if error:
        return _render_error(f"LLM review: {error}", is_zh)
    if not content:
        return ""

    # The reviewer answers in markdown, but the output is embedded as escaped
    # text: only newlines are turned into <br>, so markdown headers (## ...)
    # show up literally.
    body = _esc(content).replace("\n", "<br>")

    if is_zh:
        heading = "LLM 审计 (--llm-review)"
        disclaimer = "仅供参考,不覆盖前 6 个 label"
    else:
        heading = "LLM review (--llm-review)"
        disclaimer = "Second opinion — never overrides the 6 primary labels"

    parts = [
        "<div class='lc-result'>",
        "<div class='lc-section'>",
        f"<h3>{heading} ",
        f"<span class='lc-llm-model'>{_esc(model)}</span></h3>",
        "<div class='lc-llm-banner'>",
        f"{_label_chip('llm-opinion')} ",
        disclaimer,
        "</div>",
        f"<div class='lc-llm-content'>{body}</div>",
        "</div></div>",
    ]
    return "".join(parts)
930
+
931
+
932
def _render_error(msg: str, is_zh: bool) -> str:
    """Wrap an error message in the standard error result block.

    Args:
        msg: Message text; passed through ``_esc`` and shown inside ``<pre>``.
        is_zh: True uses the Chinese heading, False the English one.

    Returns:
        The error block as an HTML string.
    """
    heading = "Error"
    if is_zh:
        heading = "出错了"
    return "".join(
        [
            "<div class='lc-result lc-error'>",
            f"<h3>{heading}</h3>",
            f"<pre>{_esc(msg)}</pre></div>",
        ]
    )
939
+
940
+
941
+ def _render_loading(is_zh: bool) -> str:
942
+ msg = (
943
+ "正在拉取模型元数据 + 读 safetensors header… 首次大模型约 3-8 秒"
944
+ if is_zh
945
+ else "Fetching model metadata + reading safetensors header… "
946
+ "first lookup of a large model takes 3-8 seconds"
947
+ )
948
+ return (
949
+ "<div class='lc-result lc-loading'>"
950
+ "<div class='lc-spinner'></div>"
951
+ f"<div class='lc-loading-text'>{msg}</div>"
952
+ "</div>"
953
+ )
954
+
955
+
956
+ # ---------------------------------------------------------------------------
957
+ # Backend handler
958
+
959
# Module-level cache: source key -> the Evaluator built for that source.
_evaluators: dict[str, Evaluator] = {}


def _get_evaluator(source_key: str) -> Evaluator:
    """Return the cached Evaluator for a model source, building it on first use.

    The original author's note says Evaluator caches an HfApi client
    internally, so instances are reused rather than rebuilt on every call.

    Args:
        source_key: ``"modelscope"`` selects ModelScope; any other value
            falls back to Hugging Face.

    Returns:
        The Evaluator stored under ``source_key``.
    """
    evaluator = _evaluators.get(source_key)
    if evaluator is None:
        backend = ModelScopeSource() if source_key == "modelscope" else HuggingFaceSource()
        evaluator = Evaluator(source=backend)
        _evaluators[source_key] = evaluator
    return evaluator
971
+
972
+
973
def calculate(
    model_id: str,
    gpu,  # list[str] from multiselect; str also tolerated  # noqa: ANN001
    engine: str,
    context_length: int | None,
    lang: str,
    source: str,
    gpu_count: int | None,
    input_tokens: int,
    output_tokens: int,
    target_tps: float,
    prefill_util: float,
    decode_bw_util: float,
    concurrency_degradation: float,
    refresh: bool,
    explain: bool,
    llm_review: bool,
    hf_token: str,
    ms_token: str,
    llm_api_key: str,
    llm_base_url: str,
    llm_model: str,
) -> tuple[str, str, str]:
    """Run the evaluation for the form inputs and render the result panels.

    One GPU selected renders the full single-GPU report, plus the optional
    derivation trace and LLM review. Two or more GPUs render the comparison
    table only; the explain and LLM-review panels stay empty in that mode.

    Args:
        model_id: Model identifier; surrounding whitespace is stripped.
        gpu: Selected GPU name(s) as a list/tuple, or a single string.
        engine: Inference engine name forwarded to the evaluator.
        context_length: Context length override; None or <= 0 means unset.
        lang: UI language label; a value starting with "中" selects Chinese.
        source: Model source label; one containing "modelscope"
            (case-insensitive) selects ModelScope, otherwise Hugging Face.
        gpu_count: GPU count override; None or <= 0 means unset.
        input_tokens: Prompt length; falsy falls back to 2000.
        output_tokens: Output length; falsy falls back to 512.
        target_tps: Target tokens/sec; falsy falls back to 30.0.
        prefill_util: Prefill utilization; falsy falls back to 0.40.
        decode_bw_util: Decode bandwidth utilization; falsy falls back to 0.50.
        concurrency_degradation: Degradation factor; falsy falls back to 1.0.
        refresh: Forwarded to the evaluator as ``refresh``.
        explain: Whether to render the derivation trace.
        llm_review: Whether to run and render the LLM review.
        hf_token: Optional Hugging Face token, exported as HF_TOKEN for this call.
        ms_token: Optional ModelScope token, exported as MODELSCOPE_API_TOKEN
            for this call.
        llm_api_key: Optional reviewer API key (LLM_CAL_REVIEWER_API_KEY).
        llm_base_url: Optional reviewer base URL (LLM_CAL_REVIEWER_BASE_URL).
        llm_model: Optional reviewer model name (LLM_CAL_REVIEWER_MODEL).

    Returns:
        ``(main_html, explain_html, llm_review_html)``. Validation and
        evaluation failures are reported as an error block in ``main_html``
        with the other two strings empty.
    """
    locale = "zh" if lang.startswith("中") else "en"
    is_zh = locale == "zh"

    def _fail(message: str) -> tuple[str, str, str]:
        """Build the three-panel return value for an error."""
        return (_render_error(message, is_zh), "", "")

    def _restore_env(snapshot: dict) -> None:
        """Put environment variables back to their snapshotted values."""
        for key, previous in snapshot.items():
            if previous is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = previous

    # Normalize GPU input. Multiselect returns list; defensive coerce for safety.
    if isinstance(gpu, str):
        gpu_list = [gpu] if gpu else []
    elif isinstance(gpu, (list, tuple)):
        gpu_list = [g for g in gpu if g]
    else:
        gpu_list = []

    if not model_id or not model_id.strip():
        return _fail("请输入模型 ID" if is_zh else "Enter a model id")
    if not gpu_list:
        return _fail("请选择 GPU" if is_zh else "Pick a GPU")

    is_compare = len(gpu_list) >= 2

    # Resolve source key. The radio shows e.g. "HuggingFace" / "ModelScope".
    src_key = "modelscope" if "modelscope" in source.lower() else "huggingface"

    # Inject user-provided tokens into env for the duration of this call only;
    # the prior values are restored in the finally block below.
    # NOTE(review): os.environ is process-wide. If handlers run concurrently
    # in one process, another request can observe these tokens while this
    # call is in flight — confirm the app's concurrency settings.
    token_env_keys = (
        "HF_TOKEN",
        "HUGGING_FACE_HUB_TOKEN",
        "MODELSCOPE_API_TOKEN",
        "MODELSCOPE_TOKEN",
    )
    old_token_env = {k: os.environ.get(k) for k in token_env_keys}
    if hf_token and hf_token.strip():
        os.environ["HF_TOKEN"] = hf_token.strip()
    if ms_token and ms_token.strip():
        os.environ["MODELSCOPE_API_TOKEN"] = ms_token.strip()

    def _eval_one(g: str) -> EvaluationReport:
        return _get_evaluator(src_key).evaluate(
            model_id=model_id.strip(),
            gpu=g,
            engine=engine,
            gpu_count=gpu_count if gpu_count and gpu_count > 0 else None,
            context_length=context_length if context_length and context_length > 0 else None,
            refresh=refresh,
            input_tokens=int(input_tokens) if input_tokens else 2000,
            output_tokens=int(output_tokens) if output_tokens else 512,
            target_tokens_per_sec=float(target_tps) if target_tps else 30.0,
            prefill_utilization=float(prefill_util) if prefill_util else 0.40,
            decode_bw_utilization=float(decode_bw_util) if decode_bw_util else 0.50,
            concurrency_degradation=(
                float(concurrency_degradation) if concurrency_degradation else 1.0
            ),
        )

    try:
        # ---- Compare path: 2+ GPUs ----------------------------------------
        if is_compare:
            try:
                reports = [_eval_one(g) for g in gpu_list]
            except Exception as e:  # noqa: BLE001
                return _fail(f"{type(e).__name__}: {e}")
            return _render_compare(reports, locale), "", ""

        # ---- Single-GPU path ----------------------------------------------
        try:
            report = _eval_one(gpu_list[0])
        except Exception as e:  # noqa: BLE001
            return _fail(f"{type(e).__name__}: {e}")

        main_html = _render(report, locale)
        explain_html = ""
        llm_html = ""

        if explain or llm_review:
            entries = build_explain(report)
            if explain:
                explain_html = _render_explain(entries, is_zh)
            if llm_review:
                # Reviewer settings are exported only when the user actually
                # provided them, and only for the duration of run_review.
                reviewer_overrides = {
                    "LLM_CAL_REVIEWER_API_KEY": llm_api_key,
                    "LLM_CAL_REVIEWER_BASE_URL": llm_base_url,
                    "LLM_CAL_REVIEWER_MODEL": llm_model,
                }
                old_env = {k: os.environ.get(k) for k in reviewer_overrides}
                try:
                    for key, raw in reviewer_overrides.items():
                        # Tolerate None the same way hf_token/ms_token do;
                        # a bare .strip() would raise AttributeError here.
                        cleaned = (raw or "").strip()
                        if cleaned:
                            os.environ[key] = cleaned
                    result = run_review(entries, locale=locale)  # type: ignore[arg-type]
                finally:
                    _restore_env(old_env)
                llm_html = _render_llm_review(result.content, result.error, result.model, is_zh)

        return main_html, explain_html, llm_html
    finally:
        _restore_env(old_token_env)
1113
+
1114
+
1115
def show_loading(lang: str) -> tuple[str, str, str]:
    """Return the loading placeholder for the main panel and blank the other two.

    Args:
        lang: UI language label; a value starting with "中" selects Chinese.

    Returns:
        ``(loading_html, "", "")`` — the same three-slot shape ``calculate``
        returns.
    """
    spinner_html = _render_loading(lang.startswith("中"))
    return spinner_html, "", ""
1118
+
1119
+
1120
+ # ---------------------------------------------------------------------------
1121
+ # UI
1122
+
1123
# Gradio theme object: the built-in Soft theme with an indigo primary hue.
THEME = gr.themes.Soft(primary_hue="indigo")
1124
+
1125
# Static hero banner markup: title and bilingual tagline, a GitHub link with
# a shields.io star badge, and a pitch row contrasting a gpu_poor estimate
# with llm-cal's. This is a plain (non-f) string, so every number and name in
# it is hard-coded here rather than computed at runtime.
HERO_HTML = """
<div class='lc-hero'>
<div class='lc-hero-top'>
<div class='lc-hero-titleblock'>
<div class='lc-hero-title'>llm-cal</div>
<div class='lc-hero-tagline'>
LLM inference hardware calculator · 大模型推理硬件计算器<br>
Architecture-aware · Engine-aware · <strong>Honest-labeled</strong>
</div>
</div>
<a class='lc-hero-gh' href='https://github.com/FlyTOmeLight/llm-cal' target='_blank' rel='noopener'>
<svg viewBox='0 0 16 16' width='16' height='16' aria-hidden='true' fill='currentColor'>
<path d='M8 0C3.58 0 0 3.58 0 8a8 8 0 0 0 5.47 7.59c.4.07.55-.17.55-.38v-1.33c-2.22.48-2.69-1.07-2.69-1.07-.36-.92-.89-1.17-.89-1.17-.73-.5.06-.49.06-.49.81.06 1.23.83 1.23.83.72 1.23 1.88.87 2.34.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.83-2.15-.08-.2-.36-1.02.08-2.13 0 0 .67-.21 2.2.82a7.6 7.6 0 0 1 4 0c1.53-1.04 2.2-.82 2.2-.82.44 1.11.16 1.93.08 2.13.51.56.83 1.27.83 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48v2.19c0 .21.15.46.55.38A8 8 0 0 0 16 8c0-4.42-3.58-8-8-8z'/>
</svg>
<span class='lc-hero-gh-text'>GitHub</span>
<img class='lc-hero-gh-stars' alt='stars'
src='https://img.shields.io/github/stars/FlyTOmeLight/llm-cal?style=flat-square&logo=&label=&color=eef2ff&labelColor=eef2ff'
loading='lazy' />
</a>
</div>
<div class='lc-hero-pitch'>
<div class='lc-pitch-card lc-pitch-bad'>
<div class='lc-pitch-tool'>gpu_poor</div>
<div class='lc-pitch-num-bad'>284 GB</div>
<div class='lc-pitch-method'>assumes pure FP8 · 假设纯 FP8</div>
</div>
<div class='lc-pitch-arrow'>→</div>
<div class='lc-pitch-card lc-pitch-good'>
<div class='lc-pitch-tool'>llm-cal</div>
<div class='lc-pitch-num-good'>160 GB</div>
<div class='lc-pitch-method'>reads real safetensors bytes · 读真实字节</div>
</div>
<div class='lc-pitch-summary'>
<div class='lc-pitch-model'>DeepSeek-V4-Flash · H800</div>
<div class='lc-pitch-result'>0.2% error vs 45% · 误差 0.2% vs 45%</div>
</div>
</div>
</div>
"""
1164
+
1165
+
1166
+ CUSTOM_CSS = """
1167
+ /* Font stack — system fonts in both English + Chinese, no Gradio default serif */
1168
+ * {
1169
+ font-family: -apple-system, BlinkMacSystemFont, "Inter", "Helvetica Neue",
1170
+ "PingFang SC", "Microsoft YaHei", "Segoe UI", Roboto, Arial, sans-serif !important;
1171
+ }
1172
+
1173
+ /* Hide Gradio's default footer chrome that looks like part of our app */
1174
+ footer { display: none !important; }
1175
+ .show-api, .built-with, .settings { display: none !important; }
1176
+
1177
+ /* Tighter overall padding + center on wide screens — without margin:auto the
1178
+ container left-aligns and leaves ~800px empty on 1920+ displays.
1179
+ width:100% makes it shrink to viewport when narrower than max-width
1180
+ (otherwise on mobile align-items:stretch + max-width overflows). */
1181
+ .gradio-container {
1182
+ max-width: 1100px !important;
1183
+ width: 100% !important;
1184
+ margin-left: auto !important;
1185
+ margin-right: auto !important;
1186
+ }
1187
+
1188
+ /* Hero section */
1189
+ .lc-hero {
1190
+ margin: 8px 0 24px 0;
1191
+ padding: 24px 0 18px 0;
1192
+ border-bottom: 1px solid #e5e7eb;
1193
+ }
1194
+ .dark .lc-hero { border-bottom-color: #374151; }
1195
+
1196
+ /* Top row: title block (left) + GitHub link (right). On mobile the GH link
1197
+ wraps to its own line above or below the title — order kept so it stays
1198
+ visible above the fold. */
1199
+ .lc-hero-top {
1200
+ display: flex;
1201
+ align-items: flex-start;
1202
+ justify-content: space-between;
1203
+ gap: 16px;
1204
+ flex-wrap: wrap;
1205
+ margin-bottom: 14px;
1206
+ }
1207
+ .lc-hero-titleblock {
1208
+ flex: 1 1 320px;
1209
+ min-width: 0;
1210
+ }
1211
+ .lc-hero-gh {
1212
+ display: inline-flex;
1213
+ align-items: center;
1214
+ gap: 8px;
1215
+ padding: 6px 12px;
1216
+ border: 1px solid #c7d2fe;
1217
+ background: #eef2ff;
1218
+ border-radius: 999px;
1219
+ font-size: 13px !important;
1220
+ font-weight: 600 !important;
1221
+ color: #4338ca !important;
1222
+ text-decoration: none !important;
1223
+ white-space: nowrap;
1224
+ transition: background 0.15s ease, border-color 0.15s ease;
1225
+ flex: 0 0 auto;
1226
+ }
1227
+ .lc-hero-gh:hover {
1228
+ background: #e0e7ff;
1229
+ border-color: #a5b4fc;
1230
+ }
1231
+ .dark .lc-hero-gh {
1232
+ background: #1e1b4b;
1233
+ border-color: #3730a3;
1234
+ color: #c7d2fe !important;
1235
+ }
1236
+ .dark .lc-hero-gh:hover { background: #312e81; border-color: #4338ca; }
1237
+ .lc-hero-gh svg { display: block; }
1238
+ .lc-hero-gh-stars {
1239
+ height: 18px;
1240
+ vertical-align: middle;
1241
+ border-radius: 4px;
1242
+ }
1243
+
1244
+ .lc-hero-title {
1245
+ font-size: 32px !important;
1246
+ font-weight: 800 !important;
1247
+ letter-spacing: -0.02em;
1248
+ color: #0f172a !important;
1249
+ margin: 0 !important;
1250
+ line-height: 1.15;
1251
+ }
1252
+ .dark .lc-hero-title { color: #f8fafc !important; }
1253
+ .lc-hero-tagline {
1254
+ font-size: 16px !important;
1255
+ color: #6b7280 !important;
1256
+ margin: 6px 0 16px 0;
1257
+ line-height: 1.5;
1258
+ }
1259
+ .lc-hero-pitch {
1260
+ display: grid;
1261
+ /* 4 cells: bad-card / arrow / good-card / summary on wide screens */
1262
+ grid-template-columns: 1fr 30px 1fr 1.2fr;
1263
+ gap: 14px;
1264
+ align-items: stretch;
1265
+ padding: 0;
1266
+ font-size: 13px !important;
1267
+ color: #1e293b !important;
1268
+ }
1269
+ .dark .lc-hero-pitch { color: #f1f5f9 !important; }
1270
+
1271
+ /* Tablet: bad / arrow / good in row 1, summary full-width row 2 */
1272
+ @media (max-width: 900px) {
1273
+ .lc-hero-pitch {
1274
+ grid-template-columns: 1fr 28px 1fr;
1275
+ grid-template-rows: auto auto;
1276
+ }
1277
+ .lc-pitch-summary { grid-column: 1 / -1; }
1278
+ }
1279
+
1280
+ /* Mobile: stack everything, hide the arrow */
1281
+ @media (max-width: 540px) {
1282
+ .lc-hero-pitch {
1283
+ grid-template-columns: 1fr;
1284
+ grid-template-rows: repeat(3, auto);
1285
+ }
1286
+ .lc-pitch-arrow { display: none; }
1287
+ .lc-pitch-summary { grid-column: auto; }
1288
+ }
1289
+
1290
+ .lc-pitch-card {
1291
+ padding: 14px 18px;
1292
+ border-radius: 10px;
1293
+ border: 1px solid #e5e7eb;
1294
+ background: #ffffff;
1295
+ display: flex;
1296
+ flex-direction: column;
1297
+ justify-content: center;
1298
+ min-width: 0;
1299
+ }
1300
+ .dark .lc-pitch-card { background: #111827; border-color: #374151; }
1301
+ /* Subtle accent bar on the left, not a screaming red/green border */
1302
+ .lc-pitch-bad { border-left: 3px solid #cbd5e1; }
1303
+ .lc-pitch-good { border-left: 3px solid #4f46e5; }
1304
+ .dark .lc-pitch-bad { border-left-color: #475569; }
1305
+ .dark .lc-pitch-good { border-left-color: #818cf8; }
1306
+
1307
+ .lc-pitch-tool {
1308
+ font-size: 12px !important;
1309
+ font-weight: 600 !important;
1310
+ color: #6b7280 !important;
1311
+ font-family: "SF Mono", "JetBrains Mono", Menlo, monospace !important;
1312
+ margin-bottom: 4px;
1313
+ }
1314
+ .lc-pitch-num-bad { font-size: 24px !important; font-weight: 800 !important; color: #b91c1c !important; line-height: 1.1; letter-spacing: -0.01em; }
1315
+ .lc-pitch-num-good { font-size: 24px !important; font-weight: 800 !important; color: #15803d !important; line-height: 1.1; letter-spacing: -0.01em; }
1316
+ .dark .lc-pitch-num-bad { color: #f87171 !important; }
1317
+ .dark .lc-pitch-num-good { color: #4ade80 !important; }
1318
+ .lc-pitch-method {
1319
+ font-size: 11px !important;
1320
+ color: #6b7280 !important;
1321
+ margin-top: 6px;
1322
+ line-height: 1.4;
1323
+ }
1324
+
1325
+ .lc-pitch-arrow {
1326
+ display: flex;
1327
+ align-items: center;
1328
+ font-size: 22px !important;
1329
+ color: #9ca3af !important;
1330
+ font-weight: 300;
1331
+ }
1332
+
1333
+ .lc-pitch-summary {
1334
+ flex: 1 1 200px;
1335
+ padding: 14px 18px;
1336
+ border-radius: 10px;
1337
+ background: #eef2ff;
1338
+ border: 1px solid #c7d2fe;
1339
+ display: flex;
1340
+ flex-direction: column;
1341
+ justify-content: center;
1342
+ }
1343
+ .dark .lc-pitch-summary { background: #1e1b4b; border-color: #3730a3; }
1344
+ .lc-pitch-model {
1345
+ font-size: 11px !important;
1346
+ font-weight: 600 !important;
1347
+ text-transform: uppercase;
1348
+ letter-spacing: 0.06em;
1349
+ color: #6366f1 !important;
1350
+ margin-bottom: 4px;
1351
+ }
1352
+ .dark .lc-pitch-model { color: #a5b4fc !important; }
1353
+ .lc-pitch-result {
1354
+ font-size: 14px !important;
1355
+ font-weight: 700 !important;
1356
+ color: #312e81 !important;
1357
+ }
1358
+ .dark .lc-pitch-result { color: #e0e7ff !important; }
1359
+
1360
+ /* Primary button — match the indigo theme; constrain width so it's not a billboard */
1361
+ button.primary,
1362
+ button[variant="primary"],
1363
+ .primary > button {
1364
+ background: #4f46e5 !important;
1365
+ border-color: #4f46e5 !important;
1366
+ color: #ffffff !important;
1367
+ font-weight: 600 !important;
1368
+ letter-spacing: 0.01em;
1369
+ border-radius: 8px !important;
1370
+ padding: 10px 28px !important;
1371
+ }
1372
+ button.primary:hover,
1373
+ button[variant="primary"]:hover,
1374
+ .primary > button:hover { background: #4338ca !important; border-color: #4338ca !important; }
1375
+
1376
+ /* The wrapper around the Calculate button — center it, give it sane width */
1377
+ .lc-submit-wrap {
1378
+ display: flex !important;
1379
+ justify-content: center !important;
1380
+ margin: 20px 0 8px 0 !important;
1381
+ }
1382
+ .lc-submit-wrap button {
1383
+ min-width: 220px !important;
1384
+ max-width: 320px !important;
1385
+ width: auto !important;
1386
+ }
1387
+
1388
+ /* Form labels — kill Gradio's purple chip; make labels plain uppercase small text */
1389
+ [data-testid="block-info"] {
1390
+ background: transparent !important;
1391
+ border: none !important;
1392
+ padding: 0 !important;
1393
+ margin: 0 0 6px 0 !important;
1394
+ font-size: 11px !important;
1395
+ font-weight: 600 !important;
1396
+ text-transform: uppercase !important;
1397
+ letter-spacing: 0.05em !important;
1398
+ color: #6b7280 !important;
1399
+ border-radius: 0 !important;
1400
+ display: block !important;
1401
+ }
1402
+ .dark [data-testid="block-info"] { color: #9ca3af !important; }
1403
+
1404
+ /* Tooltip / info-text — single line, secondary color, no italic */
1405
+ .info-text {
1406
+ font-size: 11px !important;
1407
+ color: #94a3b8 !important;
1408
+ margin: 0 0 4px 0 !important;
1409
+ line-height: 1.4 !important;
1410
+ padding: 0 !important;
1411
+ font-style: normal !important;
1412
+ white-space: normal !important;
1413
+ }
1414
+ .info-text br { display: none !important; }
1415
+ .dark .info-text { color: #64748b !important; }
1416
+
1417
+ /* Kill Gradio's grey form-panel chrome entirely — labels + inputs float on the page */
1418
+ .block,
1419
+ .block.padded,
1420
+ .block.gradio-container,
1421
+ .form,
1422
+ .row,
1423
+ [data-testid="block"] {
1424
+ background: transparent !important;
1425
+ border: none !important;
1426
+ box-shadow: none !important;
1427
+ }
1428
+ .block.padded { padding: 6px 0 !important; }
1429
+ .form { padding: 0 !important; }
1430
+ .row { padding: 0 !important; }
1431
+
1432
+ /* Tighten row gap so inputs cluster more naturally */
1433
+ .form, .row { gap: 16px !important; }
1434
+
1435
+ /* Tablet (≤900px): Gradio's gr.Row() flex-direction: row keeps 3 inputs
1436
+ in one line. min-width: 320px forces 3-column rows to wrap to 2x1 +
1437
+ 1x1 at this size while leaving 2-column rows at 2-up. */
1438
+ @media (max-width: 900px) {
1439
+ .form,
1440
+ .row {
1441
+ flex-wrap: wrap !important;
1442
+ }
1443
+ .form > .block,
1444
+ .row > .block {
1445
+ flex: 1 1 calc(50% - 12px) !important;
1446
+ min-width: 320px !important;
1447
+ max-width: 100% !important;
1448
+ }
1449
+ }
1450
+
1451
+ /* Mobile (≤540px): single-column form. */
1452
+ @media (max-width: 540px) {
1453
+ .form,
1454
+ .row {
1455
+ flex-direction: column !important;
1456
+ }
1457
+ .form > .block,
1458
+ .row > .block {
1459
+ flex: 1 1 100% !important;
1460
+ min-width: 0 !important;
1461
+ width: 100% !important;
1462
+ }
1463
+ .gradio-container { padding: 12px !important; }
1464
+ .lc-hero-title { font-size: 26px !important; }
1465
+ .lc-pitch-num-bad, .lc-pitch-num-good { font-size: 22px !important; }
1466
+ .lc-pitch-arrow { display: none !important; }
1467
+ }
1468
+
1469
+ /* Inputs themselves — light border, soft fill */
1470
+ input[type="text"],
1471
+ input[type="number"],
1472
+ input[type="password"],
1473
+ textarea,
1474
+ select {
1475
+ border: 1px solid #e5e7eb !important;
1476
+ border-radius: 8px !important;
1477
+ background: #ffffff !important;
1478
+ font-size: 14px !important;
1479
+ padding: 10px 12px !important;
1480
+ }
1481
+ .dark input,
1482
+ .dark textarea,
1483
+ .dark select {
1484
+ background: #111827 !important;
1485
+ border-color: #374151 !important;
1486
+ }
1487
+ input:focus,
1488
+ textarea:focus {
1489
+ border-color: #4f46e5 !important;
1490
+ outline: none !important;
1491
+ box-shadow: 0 0 0 3px rgba(79,70,229,0.12) !important;
1492
+ }
1493
+
1494
+ /* Accordion — Gradio 6 has no .accordion class; the only signal is a .block
1495
+ that *contains* a button.label-wrap. Use :has() to match precisely. */
1496
+ .block.padded:has(> button.label-wrap) {
1497
+ background: #ffffff !important;
1498
+ border: 1px solid #e5e7eb !important;
1499
+ border-radius: 10px !important;
1500
+ margin: 14px 0 !important;
1501
+ padding: 0 !important;
1502
+ overflow: hidden !important;
1503
+ }
1504
+ .dark .block.padded:has(> button.label-wrap) {
1505
+ background: #111827 !important;
1506
+ border-color: #374151 !important;
1507
+ }
1508
+ button.label-wrap {
1509
+ background: #f8fafc !important;
1510
+ padding: 14px 18px !important;
1511
+ font-weight: 600 !important;
1512
+ font-size: 14px !important;
1513
+ color: #1f2937 !important;
1514
+ width: 100% !important;
1515
+ text-align: left !important;
1516
+ cursor: pointer !important;
1517
+ border: none !important;
1518
+ border-bottom: 1px solid #e5e7eb !important;
1519
+ display: flex !important;
1520
+ justify-content: space-between !important;
1521
+ align-items: center !important;
1522
+ letter-spacing: 0.01em;
1523
+ }
1524
+ .dark button.label-wrap {
1525
+ background: #1e293b !important;
1526
+ color: #f1f5f9 !important;
1527
+ border-bottom-color: #374151 !important;
1528
+ }
1529
+ button.label-wrap:hover { background: #f1f5f9 !important; }
1530
+ .dark button.label-wrap:hover { background: #334155 !important; }
1531
+ /* Sibling content of the header (the body when expanded) */
1532
+ .block.padded:has(> button.label-wrap) > *:not(button.label-wrap) {
1533
+ padding: 16px 18px !important;
1534
+ background: #ffffff !important;
1535
+ }
1536
+ .dark .block.padded:has(> button.label-wrap) > *:not(button.label-wrap) {
1537
+ background: #111827 !important;
1538
+ }
1539
+
1540
+ /* gr.Examples table — the default Gradio render is a raw HTML table with black
1541
+ borders and no hover state. Style it to match the rest of the page. */
1542
+ .gradio-dataset,
1543
+ [data-testid="dataset"] {
1544
+ margin-top: 24px !important;
1545
+ background: transparent !important;
1546
+ border: none !important;
1547
+ }
1548
+ .gradio-dataset table,
1549
+ [data-testid="dataset"] table {
1550
+ border-collapse: collapse !important;
1551
+ border: 1px solid #e5e7eb !important;
1552
+ border-radius: 8px !important;
1553
+ overflow: hidden !important;
1554
+ font-size: 13px !important;
1555
+ width: 100% !important;
1556
+ }
1557
+ .dark .gradio-dataset table,
1558
+ .dark [data-testid="dataset"] table { border-color: #374151 !important; }
1559
+ .gradio-dataset thead,
1560
+ [data-testid="dataset"] thead { background: #f9fafb !important; }
1561
+ .dark .gradio-dataset thead,
1562
+ .dark [data-testid="dataset"] thead { background: #111827 !important; }
1563
+ .gradio-dataset th,
1564
+ [data-testid="dataset"] th {
1565
+ font-size: 11px !important;
1566
+ font-weight: 600 !important;
1567
+ text-transform: uppercase !important;
1568
+ letter-spacing: 0.05em !important;
1569
+ color: #6b7280 !important;
1570
+ text-align: left !important;
1571
+ padding: 10px 12px !important;
1572
+ border: none !important;
1573
+ border-bottom: 1px solid #e5e7eb !important;
1574
+ }
1575
+ .gradio-dataset td,
1576
+ [data-testid="dataset"] td {
1577
+ padding: 9px 12px !important;
1578
+ border: none !important;
1579
+ border-bottom: 1px solid #f3f4f6 !important;
1580
+ color: #1f2937 !important;
1581
+ font-size: 13px !important;
1582
+ background: transparent !important;
1583
+ cursor: pointer !important;
1584
+ }
1585
+ .dark .gradio-dataset td,
1586
+ .dark [data-testid="dataset"] td {
1587
+ color: #e5e7eb !important;
1588
+ border-bottom-color: #1f2937 !important;
1589
+ }
1590
+ .gradio-dataset tbody tr:last-child td,
1591
+ [data-testid="dataset"] tbody tr:last-child td { border-bottom: none !important; }
1592
+ .gradio-dataset tbody tr:hover,
1593
+ [data-testid="dataset"] tbody tr:hover { background: rgba(79, 70, 229, 0.04) !important; }
1594
+ .dark .gradio-dataset tbody tr:hover,
1595
+ .dark [data-testid="dataset"] tbody tr:hover { background: rgba(129, 140, 248, 0.08) !important; }
1596
+
1597
+ /* Examples header label — Gradio puts a "Try one of these" label above */
1598
+ .gradio-dataset > .label,
1599
+ [data-testid="dataset"] > .label,
1600
+ .gradio-dataset .block-label,
1601
+ .dataset .block-label {
1602
+ font-size: 11px !important;
1603
+ font-weight: 600 !important;
1604
+ text-transform: uppercase !important;
1605
+ letter-spacing: 0.06em !important;
1606
+ color: #6b7280 !important;
1607
+ background: transparent !important;
1608
+ border: none !important;
1609
+ padding: 0 0 6px 0 !important;
1610
+ margin-bottom: 0 !important;
1611
+ }
1612
+
1613
+ /* Footer link strip */
1614
+ .lc-footer {
1615
+ margin-top: 28px;
1616
+ padding: 14px 0;
1617
+ border-top: 1px solid #e5e7eb;
1618
+ font-size: 13px !important;
1619
+ color: #6b7280 !important;
1620
+ }
1621
+ .dark .lc-footer { border-top-color: #374151; }
1622
+ .lc-footer a { color: #4f46e5 !important; text-decoration: none; }
1623
+ .lc-footer a:hover { text-decoration: underline; }
1624
+ .dark .lc-footer a { color: #818cf8 !important; }
1625
+
1626
+ /* Result wrapper */
1627
+ .lc-result {
1628
+ padding: 4px 0;
1629
+ font-size: 14px;
1630
+ line-height: 1.55;
1631
+ color: #111827 !important;
1632
+ }
1633
+ .dark .lc-result { color: #f3f4f6 !important; }
1634
+
1635
+ /* Headline */
1636
+ .lc-header { padding: 4px 0 14px 0; border-bottom: 1px solid #e5e7eb; }
1637
+ .dark .lc-header { border-bottom-color: #374151; }
1638
+ .lc-title {
1639
+ font-size: 22px !important;
1640
+ font-weight: 700 !important;
1641
+ letter-spacing: -0.01em;
1642
+ color: #0f172a !important;
1643
+ }
1644
+ .dark .lc-title { color: #f8fafc !important; }
1645
+ .lc-subtitle {
1646
+ font-size: 13px !important;
1647
+ color: #6b7280 !important;
1648
+ margin-top: 2px;
1649
+ }
1650
+
1651
+ /* Headline stat cards */
1652
+ .lc-stats {
1653
+ display: grid;
1654
+ grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
1655
+ gap: 12px;
1656
+ margin: 16px 0 8px 0;
1657
+ }
1658
+ .lc-stat {
1659
+ border: 1px solid #e5e7eb;
1660
+ border-radius: 10px;
1661
+ padding: 14px 16px;
1662
+ background: #ffffff;
1663
+ }
1664
+ .dark .lc-stat { background: #111827; border-color: #374151; }
1665
+ .lc-stat-value {
1666
+ font-size: 24px !important;
1667
+ font-weight: 700 !important;
1668
+ letter-spacing: -0.01em;
1669
+ line-height: 1.2;
1670
+ color: #0f172a !important;
1671
+ }
1672
+ .dark .lc-stat-value { color: #f8fafc !important; }
1673
+ .lc-stat-label {
1674
+ font-size: 11px !important;
1675
+ text-transform: uppercase;
1676
+ letter-spacing: 0.05em;
1677
+ color: #6b7280 !important;
1678
+ margin-top: 4px;
1679
+ font-weight: 500 !important;
1680
+ }
1681
+ .lc-stat-sub {
1682
+ font-size: 11px !important;
1683
+ color: #9ca3af !important;
1684
+ margin-top: 2px;
1685
+ }
1686
+ .lc-stat-chip { margin-top: 10px; }
1687
+
1688
+ .lc-chip {
1689
+ display: inline-block;
1690
+ padding: 2px 8px;
1691
+ border-radius: 999px;
1692
+ font-size: 11px !important;
1693
+ font-weight: 600 !important;
1694
+ letter-spacing: 0.02em;
1695
+ }
1696
+
1697
+ .lc-prov {
1698
+ margin-top: 6px;
1699
+ font-size: 12px !important;
1700
+ color: #6b7280 !important;
1701
+ font-style: italic;
1702
+ }
1703
+
1704
+ /* Sections */
1705
+ .lc-section { margin: 24px 0 0 0; }
1706
+ .lc-section h3 {
1707
+ font-size: 13px !important;
1708
+ font-weight: 600 !important;
1709
+ text-transform: uppercase;
1710
+ letter-spacing: 0.06em;
1711
+ color: #6b7280 !important;
1712
+ margin: 0 0 6px 0 !important;
1713
+ }
1714
+ .lc-section-help {
1715
+ font-size: 12px !important;
1716
+ color: #6b7280 !important;
1717
+ margin: 0 0 10px 0;
1718
+ line-height: 1.5;
1719
+ }
1720
+
1721
+ /* Tables */
1722
+ .lc-table {
1723
+ width: 100%;
1724
+ border-collapse: collapse;
1725
+ font-size: 13px !important;
1726
+ color: #111827 !important;
1727
+ }
1728
+ .dark .lc-table { color: #f3f4f6 !important; }
1729
+ .lc-table th, .lc-table td {
1730
+ padding: 8px 10px;
1731
+ border-bottom: 1px solid #f3f4f6;
1732
+ text-align: left;
1733
+ }
1734
+ .dark .lc-table th, .dark .lc-table td { border-bottom-color: #1f2937; }
1735
+ .lc-table th {
1736
+ font-size: 11px !important;
1737
+ text-transform: uppercase;
1738
+ letter-spacing: 0.04em;
1739
+ color: #6b7280 !important;
1740
+ font-weight: 500 !important;
1741
+ }
1742
+ .lc-table-recon td:nth-child(2),
1743
+ .lc-table-recon td:nth-child(3) { text-align: right; }
1744
+ .lc-best { background: rgba(22, 163, 74, 0.08); }
1745
+ .dark .lc-best { background: rgba(22, 163, 74, 0.18); }
1746
+
1747
+ /* Performance grid */
1748
+ .lc-perf {
1749
+ display: grid;
1750
+ grid-template-columns: repeat(auto-fit, minmax(170px, 1fr));
1751
+ gap: 12px;
1752
+ }
1753
+ .lc-perf-item {
1754
+ border: 1px solid #e5e7eb;
1755
+ border-radius: 10px;
1756
+ padding: 12px 14px;
1757
+ background: #ffffff;
1758
+ }
1759
+ .dark .lc-perf-item { border-color: #374151; background: #111827; }
1760
+ .lc-perf-value {
1761
+ font-size: 20px !important;
1762
+ font-weight: 700 !important;
1763
+ letter-spacing: -0.01em;
1764
+ color: #0f172a !important;
1765
+ line-height: 1.2;
1766
+ }
1767
+ .dark .lc-perf-value { color: #f8fafc !important; }
1768
+ .lc-perf-value code {
1769
+ font-size: 16px !important;
1770
+ font-weight: 600 !important;
1771
+ background: transparent !important;
1772
+ color: #0f172a !important;
1773
+ padding: 0 !important;
1774
+ }
1775
+ .dark .lc-perf-value code { color: #f8fafc !important; }
1776
+ .lc-perf-label {
1777
+ font-size: 11px !important;
1778
+ text-transform: uppercase;
1779
+ letter-spacing: 0.05em;
1780
+ color: #6b7280 !important;
1781
+ margin-top: 4px;
1782
+ font-weight: 500 !important;
1783
+ }
1784
+ .lc-perf-sub {
1785
+ font-size: 11px !important;
1786
+ color: #9ca3af !important;
1787
+ margin-top: 1px;
1788
+ }
1789
+
1790
+ /* Inline code */
1791
+ .lc-result code {
1792
+ font-family: "SF Mono", "JetBrains Mono", Menlo, Consolas, monospace !important;
1793
+ font-size: 0.92em !important;
1794
+ color: #0f172a !important;
1795
+ background: rgba(15, 23, 42, 0.06);
1796
+ padding: 1px 5px;
1797
+ border-radius: 4px;
1798
+ }
1799
+ .dark .lc-result code {
1800
+ color: #e2e8f0 !important;
1801
+ background: rgba(226, 232, 240, 0.08);
1802
+ }
1803
+
1804
+ /* Generated command — ALWAYS dark theme regardless of mode */
1805
+ .lc-cmd {
1806
+ background: #0b1220 !important;
1807
+ color: #f1f5f9 !important;
1808
+ padding: 16px 18px !important;
1809
+ border-radius: 8px;
1810
+ font-size: 12.5px !important;
1811
+ overflow-x: auto;
1812
+ white-space: pre;
1813
+ border: 1px solid #1e293b !important;
1814
+ margin: 0 !important;
1815
+ }
1816
+ .lc-cmd code {
1817
+ font-family: "SF Mono", "JetBrains Mono", Menlo, Consolas, monospace !important;
1818
+ background: transparent !important;
1819
+ color: #f1f5f9 !important;
1820
+ padding: 0 !important;
1821
+ font-size: 12.5px !important;
1822
+ border-radius: 0 !important;
1823
+ }
1824
+
1825
+ /* Comparison view — side-by-side metrics across GPUs */
1826
+ .lc-cmp-wrap {
1827
+ overflow-x: auto;
1828
+ margin: 8px 0 12px 0;
1829
+ border: 1px solid #e5e7eb;
1830
+ border-radius: 10px;
1831
+ background: #ffffff;
1832
+ }
1833
+ .dark .lc-cmp-wrap { background: #111827; border-color: #374151; }
1834
+ .lc-cmp-table {
1835
+ width: 100%;
1836
+ border-collapse: collapse;
1837
+ font-size: 13px !important;
1838
+ }
1839
+ .lc-cmp-table th,
1840
+ .lc-cmp-table td {
1841
+ padding: 10px 12px;
1842
+ text-align: left;
1843
+ border-bottom: 1px solid #f3f4f6;
1844
+ }
1845
+ .dark .lc-cmp-table th,
1846
+ .dark .lc-cmp-table td { border-bottom-color: #1f2937; }
1847
+ .lc-cmp-table thead th {
1848
+ font-size: 11px !important;
1849
+ text-transform: uppercase;
1850
+ letter-spacing: 0.05em;
1851
+ color: #6b7280 !important;
1852
+ font-weight: 600 !important;
1853
+ background: #f9fafb;
1854
+ }
1855
+ .dark .lc-cmp-table thead th { background: #1e293b; color: #9ca3af !important; }
1856
+ .lc-cmp-row-label {
1857
+ font-size: 12px !important;
1858
+ color: #6b7280 !important;
1859
+ font-weight: 600 !important;
1860
+ white-space: nowrap;
1861
+ }
1862
+ .lc-cmp-row-info {
1863
+ font-style: italic;
1864
+ color: #9ca3af !important;
1865
+ }
1866
+ .dark .lc-cmp-row-info { color: #6b7280 !important; }
1867
+ .lc-cmp-tr-info td {
1868
+ color: #6b7280;
1869
+ background: #fafafa;
1870
+ }
1871
+ .dark .lc-cmp-tr-info td { color: #9ca3af; background: #0f172a; }
1872
+ .lc-cmp-gpu {
1873
+ font-family: "SF Mono", "JetBrains Mono", Menlo, monospace !important;
1874
+ font-size: 12px !important;
1875
+ }
1876
+ .lc-cmp-table tbody tr:last-child td { border-bottom: none; }
1877
+ .lc-cmp-winner {
1878
+ background: rgba(22, 163, 74, 0.10) !important;
1879
+ font-weight: 700 !important;
1880
+ color: #15803d !important;
1881
+ position: relative;
1882
+ }
1883
+ .dark .lc-cmp-winner { background: rgba(74, 222, 128, 0.15) !important; color: #4ade80 !important; }
1884
+ .lc-cmp-winner::before {
1885
+ content: "✓ ";
1886
+ font-size: 11px;
1887
+ font-weight: 700;
1888
+ color: #15803d;
1889
+ margin-right: 2px;
1890
+ }
1891
+ .dark .lc-cmp-winner::before { color: #4ade80; }
1892
+ .lc-cmp-summary {
1893
+ margin-top: 12px;
1894
+ padding: 12px 14px;
1895
+ border-radius: 8px;
1896
+ background: #eef2ff;
1897
+ border: 1px solid #c7d2fe;
1898
+ font-size: 13px !important;
1899
+ color: #312e81 !important;
1900
+ }
1901
+ .dark .lc-cmp-summary {
1902
+ background: #1e1b4b;
1903
+ border-color: #3730a3;
1904
+ color: #e0e7ff !important;
1905
+ }
1906
+ .lc-cmp-summary strong { color: #4338ca; }
1907
+ .dark .lc-cmp-summary strong { color: #a5b4fc; }
1908
+
1909
+ /* Per-GPU detail cards under the table */
1910
+ .lc-cmp-details {
1911
+ display: grid;
1912
+ grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
1913
+ gap: 12px;
1914
+ }
1915
+ .lc-cmp-detail {
1916
+ border: 1px solid #e5e7eb;
1917
+ border-radius: 10px;
1918
+ padding: 12px 14px;
1919
+ background: #ffffff;
1920
+ }
1921
+ .dark .lc-cmp-detail { background: #111827; border-color: #374151; }
1922
+ .lc-cmp-detail-gpu {
1923
+ font-family: "SF Mono", "JetBrains Mono", Menlo, monospace !important;
1924
+ font-size: 13px !important;
1925
+ font-weight: 700 !important;
1926
+ color: #0f172a !important;
1927
+ margin-bottom: 6px;
1928
+ padding-bottom: 6px;
1929
+ border-bottom: 1px solid #e5e7eb;
1930
+ }
1931
+ .dark .lc-cmp-detail-gpu { color: #f8fafc !important; border-bottom-color: #374151; }
1932
+ .lc-cmp-detail-row {
1933
+ display: flex;
1934
+ justify-content: space-between;
1935
+ font-size: 12px !important;
1936
+ padding: 3px 0;
1937
+ }
1938
+ .lc-cmp-detail-row span { color: #6b7280 !important; }
1939
+ .lc-cmp-detail-row strong {
1940
+ color: #0f172a !important;
1941
+ font-size: 13px !important;
1942
+ }
1943
+ .dark .lc-cmp-detail-row strong { color: #f8fafc !important; }
1944
+
1945
+ /* Star-on-GitHub CTA — shown at the bottom of the result, capturing the
1946
+ peak-satisfaction moment. Card-style with indigo accent so it reads as
1947
+ "thanks", not as a banner ad. */
1948
+ .lc-star-cta {
1949
+ display: flex;
1950
+ align-items: center;
1951
+ gap: 14px;
1952
+ margin: 28px 0 8px 0;
1953
+ padding: 14px 18px;
1954
+ border: 1px solid #c7d2fe;
1955
+ background: #eef2ff;
1956
+ border-radius: 10px;
1957
+ text-decoration: none !important;
1958
+ color: #312e81 !important;
1959
+ transition: background 0.15s ease, border-color 0.15s ease, transform 0.1s ease;
1960
+ }
1961
+ .lc-star-cta:hover {
1962
+ background: #e0e7ff;
1963
+ border-color: #a5b4fc;
1964
+ }
1965
+ .lc-star-cta:active { transform: scale(0.995); }
1966
+ .dark .lc-star-cta {
1967
+ background: #1e1b4b;
1968
+ border-color: #3730a3;
1969
+ color: #c7d2fe !important;
1970
+ }
1971
+ .dark .lc-star-cta:hover { background: #312e81; }
1972
+ .lc-star-cta svg { flex: 0 0 auto; color: #4338ca; }
1973
+ .dark .lc-star-cta svg { color: #a5b4fc; }
1974
+ .lc-star-cta-text { flex: 1 1 auto; min-width: 0; }
1975
+ .lc-star-cta-q {
1976
+ font-size: 14px !important;
1977
+ font-weight: 600 !important;
1978
+ line-height: 1.3;
1979
+ color: #312e81 !important;
1980
+ }
1981
+ .dark .lc-star-cta-q { color: #e0e7ff !important; }
1982
+ .lc-star-cta-q-en {
1983
+ font-size: 12px !important;
1984
+ color: #6366f1 !important;
1985
+ margin-top: 2px;
1986
+ line-height: 1.3;
1987
+ }
1988
+ .dark .lc-star-cta-q-en { color: #a5b4fc !important; }
1989
+ .lc-star-cta-action {
1990
+ flex: 0 0 auto;
1991
+ font-size: 13px !important;
1992
+ font-weight: 700 !important;
1993
+ color: #4338ca !important;
1994
+ white-space: nowrap;
1995
+ }
1996
+ .dark .lc-star-cta-action { color: #c7d2fe !important; }
1997
+ @media (max-width: 540px) {
1998
+ .lc-star-cta { flex-wrap: wrap; gap: 10px; }
1999
+ .lc-star-cta-action { flex-basis: 100%; }
2000
+ }
2001
+
2002
+ /* Loading + error */
2003
+ .lc-loading {
2004
+ display: flex;
2005
+ align-items: center;
2006
+ gap: 14px;
2007
+ padding: 24px;
2008
+ color: #6b7280 !important;
2009
+ font-size: 14px !important;
2010
+ }
2011
+ .lc-spinner {
2012
+ width: 18px; height: 18px;
2013
+ border: 2px solid #cbd5e1;
2014
+ border-top-color: #4f46e5;
2015
+ border-radius: 50%;
2016
+ animation: lc-spin 0.7s linear infinite;
2017
+ flex: none;
2018
+ }
2019
+ @keyframes lc-spin { to { transform: rotate(360deg); } }
2020
+
2021
+ .lc-error pre {
2022
+ background: #fef2f2;
2023
+ color: #991b1b !important;
2024
+ padding: 12px 14px;
2025
+ border-radius: 8px;
2026
+ border: 1px solid #fecaca;
2027
+ font-size: 12px !important;
2028
+ white-space: pre-wrap;
2029
+ word-break: break-word;
2030
+ margin: 0;
2031
+ }
2032
+ .dark .lc-error pre { background: #450a0a; color: #fca5a5 !important; border-color: #7f1d1d; }
2033
+
2034
+ /* Explain trace */
2035
+ .lc-explain-entry {
2036
+ margin: 14px 0;
2037
+ padding: 14px 16px;
2038
+ border: 1px solid #e5e7eb;
2039
+ border-left: 3px solid #4f46e5;
2040
+ border-radius: 8px;
2041
+ background: #fafafa;
2042
+ }
2043
+ .dark .lc-explain-entry { background: #0f172a; border-color: #374151; border-left-color: #818cf8; }
2044
+ .lc-explain-heading {
2045
+ font-weight: 700 !important;
2046
+ font-size: 14px !important;
2047
+ margin-bottom: 8px;
2048
+ color: #0f172a !important;
2049
+ }
2050
+ .dark .lc-explain-heading { color: #f8fafc !important; }
2051
+ .lc-explain-formula {
2052
+ margin: 6px 0;
2053
+ font-size: 12.5px !important;
2054
+ }
2055
+ .lc-explain-formula code {
2056
+ background: rgba(79, 70, 229, 0.08) !important;
2057
+ color: #4338ca !important;
2058
+ padding: 4px 8px !important;
2059
+ border-radius: 4px;
2060
+ }
2061
+ .dark .lc-explain-formula code { color: #a5b4fc !important; background: rgba(165, 180, 252, 0.12) !important; }
2062
+ .lc-explain-inputs, .lc-explain-steps {
2063
+ margin: 6px 0 6px 1.2em;
2064
+ font-size: 12.5px !important;
2065
+ line-height: 1.7;
2066
+ }
2067
+ .lc-explain-label {
2068
+ font-size: 11px !important;
2069
+ color: #6b7280 !important;
2070
+ font-style: italic;
2071
+ }
2072
+ .lc-explain-result {
2073
+ margin-top: 8px;
2074
+ padding-top: 8px;
2075
+ border-top: 1px dashed #e5e7eb;
2076
+ font-size: 13px !important;
2077
+ color: #0f172a !important;
2078
+ }
2079
+ .dark .lc-explain-result { color: #f8fafc !important; border-top-color: #374151; }
2080
+
2081
+ /* LLM review */
2082
+ .lc-llm-banner {
2083
+ display: flex;
2084
+ align-items: center;
2085
+ gap: 8px;
2086
+ padding: 8px 12px;
2087
+ background: #f9fafb;
2088
+ border: 1px solid #e5e7eb;
2089
+ border-radius: 8px;
2090
+ font-size: 12px !important;
2091
+ color: #4b5563 !important;
2092
+ margin-bottom: 12px;
2093
+ }
2094
+ .dark .lc-llm-banner { color: #d1d5db !important; background: #111827; border-color: #374151; }
2095
+ .lc-llm-model {
2096
+ font-size: 11px !important;
2097
+ color: #6b7280 !important;
2098
+ font-weight: 500 !important;
2099
+ margin-left: 6px;
2100
+ text-transform: none !important;
2101
+ letter-spacing: 0 !important;
2102
+ }
2103
+ .lc-llm-content {
2104
+ font-size: 13px !important;
2105
+ line-height: 1.7;
2106
+ color: #0f172a !important;
2107
+ padding: 12px 14px;
2108
+ border: 1px solid #e5e7eb;
2109
+ border-radius: 8px;
2110
+ background: #ffffff;
2111
+ }
2112
+ .dark .lc-llm-content { color: #f3f4f6 !important; background: #111827; border-color: #374151; }
2113
+ """
2114
+
2115
+
2116
def _build_ui() -> gr.Blocks:
    """Assemble the Gradio Blocks app: input form, result panes, examples, events.

    Layout, top to bottom: hero banner, the required inputs, two accordions
    that start collapsed (performance tuning, advanced), the submit button,
    three HTML result panes, the examples table and the footer link strip.
    Two events are wired: a vendor change repopulates the GPU dropdown, and a
    click on the submit button runs ``show_loading`` followed by ``calculate``.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(title="llm-cal — LLM hardware calculator") as demo:
        gr.HTML(HERO_HTML)

        # ---- Required inputs: model + source -------------------------------
        with gr.Row():
            model_id = gr.Textbox(
                label="Model ID · 模型 ID",
                info="Repo id · 仓库 ID(owner/name)",
                placeholder="e.g. deepseek-ai/DeepSeek-V4-Flash",
                scale=3,
            )
            source = gr.Radio(
                label="Source · 来源",
                info="Where to pull model metadata · 拉取来源",
                choices=["HuggingFace", "ModelScope"],
                value="HuggingFace",
                scale=2,
            )

        # ---- Required inputs: hardware -------------------------------------
        with gr.Row():
            vendor = gr.Dropdown(
                label="GPU vendor · GPU 厂商",
                info="11 vendors covered · 共 11 家",
                choices=VENDOR_CHOICES_EN,
                value=DEFAULT_VENDOR,
                scale=1,
            )
            gpu = gr.Dropdown(
                label="GPU model · GPU 型号",
                info="One GPU = single eval. 2-4 = compare side-by-side · 选 1 张单评估,2-4 张对比",
                choices=_VENDOR_TO_GPUS[DEFAULT_VENDOR],
                value=[DEFAULT_GPU],
                multiselect=True,
                max_choices=4,
                allow_custom_value=True,
                scale=2,
            )

        # ---- Required inputs: engine / context / language ------------------
        with gr.Row():
            engine = gr.Radio(
                label="Engine · 引擎",
                info="Inference engine · 推理引擎",
                choices=["vllm", "sglang"],
                value="vllm",
            )
            context_length = gr.Number(
                label="Context length · Context 长度",
                info="Empty = 4K/32K/128K/1M · 留空显示全档",
                value=None,
                precision=0,
            )
            lang = gr.Radio(
                label="Output language · 输出语言",
                info="Result area only · 仅影响下方结果区",
                choices=["English", "中文"],
                value="English",
            )

        # ---- Performance tuning (accordion, closed by default) -------------
        with gr.Accordion("Performance tuning · 性能参数", open=False):
            with gr.Row():
                input_tokens = gr.Number(
                    label="Input tokens · 输入 tokens",
                    info="Prefill budget · Prefill 预算",
                    value=2000,
                    precision=0,
                )
                output_tokens = gr.Number(
                    label="Output tokens · 输出 tokens",
                    info="Decode budget · Decode 预算",
                    value=512,
                    precision=0,
                )
                target_tps = gr.Number(
                    label="Target tok/s/user · 单用户目标 tok/s",
                    info="SLA per user · 单用户 SLA(30 ≈ 流畅阅读)",
                    value=30.0,
                )
            with gr.Row():
                prefill_util = gr.Number(
                    label="Prefill util · Prefill 利用率",
                    info="0–1 · 0.40 = vLLM paper baseline",
                    value=0.40,
                )
                decode_bw_util = gr.Number(
                    label="Decode BW util · Decode 带宽利用率",
                    info="0–1 · 0.50 = community median",
                    value=0.50,
                )
                concurrency_degradation = gr.Number(
                    label="Concurrency degradation · 并发衰减",
                    info="1.0 = honest · 1.67 = 60% efficiency under load",
                    value=1.0,
                )

        # ---- Advanced (accordion, closed by default) -----------------------
        with gr.Accordion("Advanced · 高级", open=False):
            # Access tokens for gated repositories.
            with gr.Row():
                hf_token = gr.Textbox(
                    label="HF_TOKEN",
                    info="For gated HF models · 私有 HF 模型用",
                    placeholder="hf_...",
                    type="password",
                    value="",
                )
                ms_token = gr.Textbox(
                    label="MODELSCOPE_API_TOKEN",
                    info="For gated MS models · 私有 MS 模型用",
                    placeholder="ms-...",
                    type="password",
                    value="",
                )
            # Evaluation overrides.
            with gr.Row():
                gpu_count = gr.Number(
                    label="Force GPU count · 强制 GPU 数",
                    info="Empty = auto min/dev/prod · 留空自动给三档",
                    value=None,
                    precision=0,
                )
                refresh = gr.Checkbox(
                    label="Refresh cache · 刷新缓存",
                    info="Bypass diskcache · 跳过本地缓存",
                    value=False,
                )
            # Optional extra output panes.
            with gr.Row():
                explain = gr.Checkbox(
                    label="--explain · 推导链",
                    info="Full derivation trace · 输出完整推导链",
                    value=False,
                )
                llm_review = gr.Checkbox(
                    label="--llm-review · LLM 审计",
                    info="Second opinion from an LLM · 第二意见审计",
                    value=False,
                )
            # Connection settings for the LLM reviewer.
            with gr.Row():
                llm_api_key = gr.Textbox(
                    label="LLM API key · LLM API 密钥",
                    info="OpenAI-compatible endpoint · OpenAI 兼容端点",
                    placeholder="sk-...",
                    type="password",
                    value="",
                )
                llm_base_url = gr.Textbox(
                    label="LLM base URL · LLM 基地址",
                    info="e.g. https://api.deepseek.com/v1",
                    placeholder="https://api.openai.com/v1",
                    value="",
                )
                llm_model = gr.Textbox(
                    label="LLM model · LLM 模型名",
                    info="e.g. gpt-4o / deepseek-chat / MiniMax-M2",
                    placeholder="gpt-4o",
                    value="",
                )

        with gr.Row(elem_classes="lc-submit-wrap"):
            submit = gr.Button("Calculate · 计算", variant="primary", size="lg")

        # Result area: the main pane plus one pane each for the explain trace
        # and the LLM review.
        output_main = gr.HTML(label="Result")
        output_explain = gr.HTML(label="Explain trace")
        output_llm = gr.HTML(label="LLM review")

        # The GPU dropdown is multiselect, so each example row must supply the
        # GPU as a one-element list.
        example_rows = [
            [name, vend, [card], eng, None, "English", origin]
            for name, vend, card, eng, origin in EXAMPLE_MODELS
        ]
        gr.Examples(
            examples=example_rows,
            inputs=[model_id, vendor, gpu, engine, context_length, lang, source],
            label="Try one of these · 试试这些组合",
        )

        footer_html = (
            "<div class='lc-footer'>"
            "<a href='https://github.com/FlyTOmeLight/llm-cal' target='_blank'>GitHub</a> · "
            "<a href='https://flytomelight.github.io/llm-cal/' target='_blank'>Docs</a> · "
            "<a href='https://flytomelight.github.io/llm-cal/methodology/' target='_blank'>Methodology</a> · "
            "<code>pip install llm-cal</code>"
            "</div>"
        )
        gr.HTML(footer_html)

        # Switching vendor swaps the dropdown choices but leaves the current
        # selection alone: compare mode exists precisely to mix GPUs from
        # different vendors (e.g. H800 + MI300X + 910B4).
        def _on_vendor_change(v: str, current):  # noqa: ANN001, ANN202
            vendor_gpus = _VENDOR_TO_GPUS.get(v, [])
            # A multiselect dropdown hands back a list; tolerate a bare
            # string or None as well.
            if isinstance(current, list):
                selection = list(current)
            else:
                selection = [current] if current else []
            # Nothing selected: fall back to the vendor's first GPU (if any)
            # so the form stays usable.
            if not selection:
                selection = list(vendor_gpus[:1])
            return gr.Dropdown(choices=vendor_gpus, value=selection)

        vendor.change(fn=_on_vendor_change, inputs=[vendor, gpu], outputs=[gpu])

        # Submit is a two-step chain: paint the loading state first, then run
        # the actual calculation into the same three panes.
        result_panes = [output_main, output_explain, output_llm]
        calc_inputs = [
            model_id, gpu, engine, context_length, lang, source,
            gpu_count, input_tokens, output_tokens, target_tps,
            prefill_util, decode_bw_util, concurrency_degradation,
            refresh, explain, llm_review,
            hf_token, ms_token,
            llm_api_key, llm_base_url, llm_model,
        ]
        loading_step = submit.click(
            fn=show_loading,
            inputs=[lang],
            outputs=result_panes,
        )
        loading_step.then(
            fn=calculate,
            inputs=calc_inputs,
            outputs=result_panes,
        )

    return demo
2339
+
2340
+
2341
def _prewarm_cache() -> None:
    """Evaluate every Examples row once so its artifacts are already cached.

    The point is to spare the first user who clicks an example the 3-8s
    HF/MS metadata roundtrip.

    Intended to run on a daemon thread next to the Gradio server. This is a
    UX nicety, never a hard dependency: any exception raised for an example
    is printed and that example is skipped. Setting LLM_CAL_PREWARM=0
    disables the warm-up (handy in local dev, where you may not want a burst
    of API calls on every `python web/app.py`).
    """
    import time

    print(f"[prewarm] starting cache warm-up for {len(EXAMPLE_MODELS)} examples")
    for position, example in enumerate(EXAMPLE_MODELS, 1):
        model_id, _vendor, gpu, engine, source = example
        if "modelscope" in source.lower():
            src_key = "modelscope"
        else:
            src_key = "huggingface"
        label = f"{position}/{len(EXAMPLE_MODELS)} {src_key}:{model_id}"
        try:
            started = time.monotonic()
            evaluator = _get_evaluator(src_key)
            evaluator.evaluate(model_id=model_id, gpu=gpu, engine=engine)
            print(f"[prewarm] {label} ok ({time.monotonic() - started:.1f}s)")
        except Exception as exc:  # noqa: BLE001
            # Best effort by design: report the failure and carry on.
            print(f"[prewarm] {label} skip — {type(exc).__name__}: {exc}")
        # Pause between examples to stay well under HF/MS anonymous rate limits.
        time.sleep(2)
    print("[prewarm] done")
2369
+
2370
+
2371
if __name__ == "__main__":
    # Cache warm-up is on unless LLM_CAL_PREWARM is set to something other
    # than "1"; it runs on a daemon thread so it never blocks shutdown.
    prewarm_enabled = os.environ.get("LLM_CAL_PREWARM", "1") == "1"
    if prewarm_enabled:
        import threading

        warmup_thread = threading.Thread(target=_prewarm_cache, daemon=True)
        warmup_thread.start()

    app = _build_ui()
    app.launch(theme=THEME, css=CUSTOM_CSS)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio>=6.0,<7.0
2
+ llm-cal>=0.1.3
src/llm_cal/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """llm-cal — LLM inference hardware calculator."""
2
+
3
+ from llm_cal.core.evaluator import Evaluator
4
+ from llm_cal.output.labels import Label
5
+
6
+ __all__ = ["Evaluator", "Label"]
src/llm_cal/architecture/__init__.py ADDED
File without changes
src/llm_cal/architecture/detector.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """`detect()` — main orchestration over trait sub-detectors.
2
+
3
+ Step 1: Family dispatch (state_space vs transformer vs unknown).
4
+ Step 2: Gather traits (independent sub-detectors).
5
+ Step 3: Assemble Profile with a confidence level.
6
+
7
+ Fallback path: `_fallback_unknown()` for configs missing key fields. This is
8
+ the bedrock of "works on day-0" — new model types degrade gracefully.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from typing import Any
14
+
15
+ from llm_cal.architecture.profile import (
16
+ ArchitectureProfile,
17
+ Confidence,
18
+ Family,
19
+ )
20
+ from llm_cal.architecture.traits import (
21
+ detect_attention,
22
+ detect_moe,
23
+ detect_position,
24
+ detect_sliding_window,
25
+ )
26
+
27
# Model types we know we handle well. Maintained alongside engine_compat matrix.
KNOWN_MODEL_TYPES: frozenset[str] = frozenset(
    {
        "llama",
        "mistral",
        "mixtral",
        "qwen2",
        "qwen2_moe",
        "qwen3",
        "qwen3_moe",
        "deepseek_v2",
        "deepseek_v3",
        "deepseek_v3_2",
        # The benchmark dataset expects `model_type=deepseek_v32` (no second
        # underscore) to be recognized for DeepSeek-V3.2; accept both spellings.
        "deepseek_v32",
        "deepseek_v4",
        "gemma",
        "gemma2",
        "gemma3",
        "phi",
        "phi3",
    }
)

# Model types dispatched to the state-space family instead of transformer.
STATE_SPACE_TYPES: frozenset[str] = frozenset({"mamba", "mamba2", "falcon_mamba", "jamba"})
50
+
51
+
52
def detect(config: dict[str, Any]) -> ArchitectureProfile:
    """Main entry. Given a parsed config.json dict, return an ArchitectureProfile.

    Args:
        config: Parsed config.json contents. A key that is present but set to
            JSON ``null`` is treated exactly like a missing key.

    Returns:
        * a STATE_SPACE profile (flagged ``v0_1_unsupported``) when
          ``model_type`` is in STATE_SPACE_TYPES or an ``ssm_cfg`` key exists;
        * the `_fallback_unknown()` profile when neither ``model_type`` nor
          ``architectures`` is usable, or when ``num_hidden_layers`` /
          ``hidden_size`` is missing or zero;
        * otherwise a TRANSFORMER profile carrying every detected trait.
    """
    # `x.get(k) or default` instead of `x.get(k, default)`: the latter returns
    # None for an explicit null, which used to become the truthy string "none"
    # for model_type and a TypeError for the numeric / list fields.
    model_type = str(config.get("model_type") or "").lower()
    architectures = tuple(str(a).lower() for a in (config.get("architectures") or ()))
    vocab_size = int(config.get("vocab_size") or 0)

    # Step 1: state_space family short-circuits — v0.1 unsupported, but we identify it
    if model_type in STATE_SPACE_TYPES or "ssm_cfg" in config:
        return ArchitectureProfile(
            model_type=model_type,
            architectures=architectures,
            family=Family.STATE_SPACE,
            num_hidden_layers=int(config.get("num_hidden_layers") or 0),
            hidden_size=int(config.get("hidden_size") or 0),
            vocab_size=vocab_size,
            confidence=Confidence.HIGH,
            auxiliary={"v0_1_unsupported": True},
        )

    # Step 2: reject if fundamentally unidentifiable
    if not model_type and not architectures:
        return _fallback_unknown(config)

    # Step 3: required fields
    num_layers = config.get("num_hidden_layers")
    hidden_size = config.get("hidden_size")
    if not num_layers or not hidden_size:
        return _fallback_unknown(config)

    # Step 4: gather traits (each is independent and may return None)
    attention = detect_attention(config)
    moe = detect_moe(config)
    position = detect_position(config)
    sliding = detect_sliding_window(config)

    # Step 5: confidence — HIGH iff model_type is in the registry
    confidence = Confidence.HIGH if model_type in KNOWN_MODEL_TYPES else Confidence.MEDIUM

    # Pass-through of config fields our formulas can use downstream. Keeps the
    # Profile schema stable while enabling richer computation (e.g. dense FFN
    # param count needs intermediate_size).
    auxiliary: dict[str, object] = {}
    if isinstance(config.get("intermediate_size"), int):
        auxiliary["intermediate_size"] = config["intermediate_size"]
    if config.get("tie_word_embeddings") is not None:
        auxiliary["tie_word_embeddings"] = bool(config["tie_word_embeddings"])

    return ArchitectureProfile(
        model_type=model_type,
        architectures=architectures,
        family=Family.TRANSFORMER,
        num_hidden_layers=int(num_layers),
        hidden_size=int(hidden_size),
        vocab_size=vocab_size,
        confidence=confidence,
        attention=attention,
        moe=moe,
        position=position,
        sliding_window=sliding,
        auxiliary=auxiliary,
    )
111
+
112
+
113
def _fallback_unknown(config: dict[str, Any]) -> ArchitectureProfile:
    """Graceful degradation when config.json is unusable.

    Still returns a valid Profile. Consumers check `family == Family.UNKNOWN`
    or `confidence == Confidence.LOW` and skip KV-cache estimation accordingly.

    Args:
        config: Parsed config.json contents, possibly partial. Missing keys
            and keys explicitly set to JSON ``null`` both fall back to the
            empty / zero value, so this path does not raise on the very
            configs it exists to absorb.

    Returns:
        An UNKNOWN-family, LOW-confidence profile with a human-readable
        ``auxiliary["warning"]``.
    """
    # `get(k) or default` (not `get(k, default)`) so that a present-but-null
    # value cannot reach int() / iteration as None.
    return ArchitectureProfile(
        model_type=str(config.get("model_type") or "").lower(),
        architectures=tuple(str(a).lower() for a in (config.get("architectures") or ())),
        family=Family.UNKNOWN,
        num_hidden_layers=int(config.get("num_hidden_layers") or 0),
        hidden_size=int(config.get("hidden_size") or 0),
        vocab_size=int(config.get("vocab_size") or 0),
        confidence=Confidence.LOW,
        auxiliary={
            "warning": (
                "No recognizable model_type or missing essential config fields. "
                "Weight estimate from safetensors file size only; "
                "KV cache cannot be estimated; engine compatibility unknown."
            )
        },
    )
src/llm_cal/architecture/formulas/__init__.py ADDED
File without changes
src/llm_cal/architecture/formulas/kv_cache.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """KV cache estimation — traits-composed formula.
2
+
3
+ The formula is NOT owned by a single architecture module. Instead we compose it
4
+ from the traits on `ArchitectureProfile`:
5
+
6
+ baseline = 2 (K+V) * num_kv_heads * head_dim * seq_len * dtype_bytes * num_layers
7
+
8
+ Then apply compositional modifiers:
9
+ * MLA: baseline uses kv_lora_rank instead of num_kv_heads * head_dim
10
+ (DeepSeek's compressed KV representation)
11
+ * CSA_HCA: multiply by an effective-ratio derived from compress_ratios
12
+ (most layers are heavily compressed, a few are dense)
13
+ * Sliding window: cap `seq_len` at the window size
14
+ * NSA: multiply by (nsa_topk / seq_len), clamped — sparse attention
15
+ keeps only top-k keys
16
+
17
+ Returns AnnotatedValue tagged [estimated] unless we can't compute it at all.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ from llm_cal.architecture.profile import (
23
+ ArchitectureProfile,
24
+ AttentionTraits,
25
+ Confidence,
26
+ Family,
27
+ )
28
+ from llm_cal.output.labels import AnnotatedValue, Label
29
+
30
+
31
def compute_kv_cache_bytes(
    profile: ArchitectureProfile,
    seq_len: int,
    dtype_bytes: int = 2,  # BF16/FP16 default
) -> AnnotatedValue[int]:
    """Estimate the KV cache of one request holding `seq_len` tokens.

    Args:
        profile: Architecture snapshot; its attention traits, layer count and
            sliding window drive the formula.
        seq_len: Tokens kept in the cache for the request.
        dtype_bytes: Bytes per cached element.

    Returns:
        An AnnotatedValue. Label.ESTIMATED carries the byte count (0 when
        `seq_len` is not positive); Label.UNKNOWN with value 0 means the
        profile does not allow an estimate, and `source` says why.
    """
    if seq_len <= 0:
        return AnnotatedValue(0, Label.ESTIMATED, source="seq_len <= 0")

    # First matching reason wins; checks run in the same order as the guards
    # they replace.
    unknown_reason = None
    if profile.family == Family.STATE_SPACE:
        unknown_reason = "state-space model has no KV cache concept"
    elif profile.family == Family.UNKNOWN or profile.confidence == Confidence.LOW:
        unknown_reason = "unknown architecture — cannot estimate KV cache"
    elif profile.attention is None or profile.num_hidden_layers <= 0:
        unknown_reason = "missing attention traits or layer count"
    if unknown_reason is not None:
        return AnnotatedValue(0, Label.UNKNOWN, source=unknown_reason)

    attn = profile.attention
    n_layers = profile.num_hidden_layers

    # Effective sequence length. The sliding-window cap is only for standard
    # attention (MHA/GQA/MQA): the explicitly sparse variants already encode a
    # per-layer reduction, and capping on top of that would double-count
    # (measured 1000x too low on DeepSeek-V4).
    effective_seq = seq_len
    sliding_note = ""
    if attn.variant not in ("CSA_HCA", "NSA"):
        window = profile.sliding_window
        if window and window > 0 and window < seq_len:
            effective_seq = window
            sliding_note = (
                f" (sliding_window={profile.sliding_window} caps {seq_len} -> {effective_seq})"
            )

    # Dense baseline over the whole layer stack.
    baseline = _per_layer_per_token_bytes(attn, dtype_bytes) * effective_seq * n_layers

    # Sparse-attention modifier; the two variants are mutually exclusive.
    result_bytes = baseline
    variant_note: str = str(attn.variant)
    if attn.variant == "CSA_HCA":
        if attn.compress_ratios:
            ratio = _average_csa_hca_ratio(attn.compress_ratios)
            result_bytes = int(baseline * ratio)
            variant_note = f"{variant_note} (avg compress ratio {ratio:.3f})"
    elif attn.variant == "NSA":
        if attn.nsa_topk and attn.nsa_topk > 0:
            sparsity = min(1.0, attn.nsa_topk / effective_seq)
            result_bytes = int(baseline * sparsity)
            variant_note = f"{variant_note} (nsa_topk={attn.nsa_topk}, sparsity={sparsity:.3f})"

    return AnnotatedValue(
        result_bytes,
        Label.ESTIMATED,
        source=(
            f"{variant_note}: 2*kv_shape*{dtype_bytes}B*{effective_seq}*{n_layers}{sliding_note}"
        ),
    )
110
+
111
+
112
def _per_layer_per_token_bytes(attn: AttentionTraits, dtype_bytes: int) -> int:
    """Return the bytes one token occupies in one layer's KV cache.

    MLA with a known `kv_lora_rank` stores a single latent vector shared by K
    and V (so there is no factor of 2). Every other case — including MLA
    without a rank, and CSA+HCA whose sparse scaling is applied by the caller —
    stores K and V separately per KV head.
    """
    uses_shared_latent = attn.variant == "MLA" and attn.kv_lora_rank
    if not uses_shared_latent:
        # K and V both stored: factor of 2.
        return 2 * attn.num_kv_heads * attn.head_dim * dtype_bytes
    return attn.kv_lora_rank * dtype_bytes
122
+
123
+
124
def _average_csa_hca_ratio(compress_ratios: tuple[int, ...]) -> float:
    """Average the per-layer "keep fraction" encoded by `compress_ratios`.

    Each entry follows the DeepSeek V4 convention:

        0   -> dense attention, the layer keeps 100% of tokens
        N>0 -> the layer keeps 1/N of tokens

    Example: ratios = [0, 0, 4, 128] gives (1 + 1 + 1/4 + 1/128) / 4.

    An empty sequence yields 1.0 (no compression).
    """
    if not compress_ratios:
        return 1.0
    keep_fractions = [1.0 if ratio == 0 else 1.0 / ratio for ratio in compress_ratios]
    return sum(keep_fractions, 0.0) / len(compress_ratios)
src/llm_cal/architecture/formulas/weight.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Weight count estimation — total parameters and total bytes by assumption.
2
+
3
+ Two distinct purposes, kept separate by label:
4
+ * estimate_total_params(profile) -> [estimated] param count
5
+ * predicted_bytes_under_quant(params, scheme) -> [estimated] bytes
6
+
7
+ The weight_analyzer/reconciler compares predicted_bytes against observed file
8
+ sizes to identify the actual quantization scheme. That's the DeepSeek-V4 story.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from llm_cal.architecture.profile import ArchitectureProfile
14
+ from llm_cal.output.labels import AnnotatedValue, Label
15
+ from llm_cal.weight_analyzer import _QUANT_BPP, QuantizationScheme
16
+
17
+
18
def estimate_total_params(profile: ArchitectureProfile) -> AnnotatedValue[int]:
    """Rough param count from Profile.

    Components summed:
      - Embedding: vocab_size * hidden_size, plus an equally sized output head
        unless ``auxiliary["tie_word_embeddings"]`` is truthy.
      - Per-layer attention projections (`_attention_params`).
      - Per-layer FFN, dense or MoE (`_ffn_params`).
      - Per-layer norms: two vectors of `hidden_size`.

    Returns:
        [estimated] total — an arithmetic sum over config values with several
        simplifying assumptions — whose `source` spells out the arithmetic
        actually used; or 0 tagged [unknown] when the layer count or hidden
        size is not positive.
    """
    if profile.num_hidden_layers <= 0 or profile.hidden_size <= 0:
        return AnnotatedValue(0, Label.UNKNOWN, source="insufficient shape info in profile")

    hidden = profile.hidden_size
    n_layers = profile.num_hidden_layers
    vocab = profile.vocab_size

    # Embedding + output head. When weights are tied (Gemma, some Llamas),
    # the output head IS the embedding — don't count twice.
    embed_params = vocab * hidden
    tied = bool(profile.auxiliary.get("tie_word_embeddings", False))
    output_head_params = 0 if tied else vocab * hidden

    # Per-layer attention projections.
    attn_params = _attention_params(profile)

    # Per-layer FFN (dense path) OR MoE expert block.
    ffn_params = _ffn_params(profile)

    # Per-layer LayerNorms (2 of them, one scalar per feature).
    norm_params = 2 * hidden

    per_layer = attn_params + ffn_params + norm_params
    total = embed_params + output_head_params + per_layer * n_layers

    # The explanation must match the arithmetic: with tied weights the
    # vocab*hidden term is counted once, so it must not claim "* 2".
    if tied:
        embed_note = "1 (embed, head tied)"
    else:
        embed_note = "2 (embed+head)"

    return AnnotatedValue(
        total,
        Label.ESTIMATED,
        source=(
            f"{vocab} vocab * {hidden} hidden * {embed_note} + "
            f"{n_layers} layers * ({attn_params:,} attn + {ffn_params:,} ffn + norms)"
        ),
    )
62
+
63
+
64
def _attention_params(profile: ArchitectureProfile) -> int:
    """Count the attention projection parameters (Q/K/V/O) of a single layer.

    Returns 0 when the profile carries no attention traits. MLA with a known
    `q_lora_rank` is costed as a stack of low-rank down/up projections; every
    other variant uses full Q, K, V and O matrices.
    """
    attn = profile.attention
    if attn is None:
        return 0

    hidden = profile.hidden_size
    q_width = attn.num_heads * attn.head_dim

    # MLA uses low-rank projections — very different shape.
    if attn.variant == "MLA" and attn.q_lora_rank:
        q_rank = attn.q_lora_rank
        kv_rank = attn.kv_lora_rank or q_rank  # approximate when kv rank is absent
        low_rank_terms = (
            hidden * q_rank,  # Q down
            q_rank * q_width,  # Q up
            hidden * kv_rank * 2,  # K+V down
            kv_rank * q_width,  # K+V up
            q_width * q_rank,  # O down (q rank reused as an o-rank approximation)
            q_rank * hidden,  # O up
        )
        return sum(low_rank_terms)

    # Standard/GQA/MQA: Q and O are hidden x q_width; K and V are hidden x kv_width.
    kv_width = attn.num_kv_heads * attn.head_dim
    return 2 * hidden * q_width + 2 * hidden * kv_width
90
+
91
+
92
def _ffn_params(profile: ArchitectureProfile) -> int:
    """Count the FFN parameters (dense or MoE) of a single layer.

    For MoE every expert — routed and shared — is counted, plus the router,
    because all of them are resident in memory. Parameters active per token
    are a different metric and are not computed here.
    """
    hidden = profile.hidden_size
    moe = profile.moe

    if moe is None:
        # Dense path: take intermediate_size from auxiliary when it is a
        # positive int, otherwise assume 4 * hidden.
        width = profile.auxiliary.get("intermediate_size")
        usable = isinstance(width, int) and width > 0
        if not usable:
            width = 4 * hidden
        # SwiGLU: 3 matrices
        return 3 * hidden * width

    # SwiGLU-style expert: gate, up and down matrices, each hidden x moe_intermediate.
    per_expert = 3 * hidden * moe.moe_intermediate_size
    expert_count = moe.num_routed_experts + moe.num_shared_experts
    # Router: hidden x num_routed_experts
    router_params = hidden * moe.num_routed_experts
    return per_expert * expert_count + router_params
115
+
116
+
117
def predicted_bytes_under_quant(
    total_params: int, scheme: QuantizationScheme
) -> AnnotatedValue[int]:
    """Predict the byte size of `total_params` parameters stored under `scheme`.

    Looks up bytes-per-param in `_QUANT_BPP`. A scheme that is missing from
    the table (or mapped to 0.0) yields 0 tagged [unknown]; otherwise the
    truncated product is returned tagged [estimated].
    """
    bpp = _QUANT_BPP.get(scheme, 0.0)
    if bpp != 0.0:
        predicted = int(total_params * bpp)
        return AnnotatedValue(
            predicted,
            Label.ESTIMATED,
            source=f"{total_params:,} params * {bpp} bytes/param ({scheme})",
        )
    return AnnotatedValue(
        0,
        Label.UNKNOWN,
        source=f"no bytes-per-param mapping for {scheme}",
    )
src/llm_cal/architecture/profile.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ArchitectureProfile — the core data class the whole tool orbits.
2
+
3
+ Key insight: an architecture is NOT a single label. It's a combination of independent
4
+ traits that co-exist on a Profile. DeepSeek-V3.2 = MoE + MLA + NSA — three traits.
5
+ Single-module dispatch cannot express this; traits composition can.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+ from enum import StrEnum
12
+ from typing import Literal
13
+
14
+
15
+ class Family(StrEnum):
16
+ TRANSFORMER = "transformer"
17
+ STATE_SPACE = "state_space" # Mamba, etc. — v0.1 unsupported
18
+ UNKNOWN = "unknown"
19
+
20
+
21
+ class Confidence(StrEnum):
22
+ HIGH = "high" # model_type in KNOWN_MODEL_TYPES, all fields present
23
+ MEDIUM = "medium" # model_type unknown but architectures[] or config partial
24
+ LOW = "low" # fallback path, config.json missing or malformed
25
+
26
+
27
+ AttentionVariant = Literal["MHA", "GQA", "MQA", "MLA", "NSA", "CSA_HCA"]
28
+
29
+
30
@dataclass(frozen=True)
class AttentionTraits:
    """Shape of the attention layers, as filled in by `detect_attention()`.

    The four leading fields are always set; the optional ones default to None
    and are populated only for the variants noted beside them.
    """

    variant: AttentionVariant
    num_heads: int
    num_kv_heads: int
    head_dim: int

    # Low-rank projection ranks — MLA-specific (DeepSeek V2+).
    q_lora_rank: int | None = None
    kv_lora_rank: int | None = None

    # Per-layer compression ratios — sparse attention (CSA+HCA per DeepSeek V4).
    compress_ratios: tuple[int, ...] | None = None

    # Top-k budget — sparse attention (NSA per DeepSeek V3.2).
    nsa_topk: int | None = None
45
+
46
+
47
@dataclass(frozen=True)
class MoETraits:
    """Shape of the mixture-of-experts block.

    A Profile whose `moe` is None describes a dense model.
    """

    num_routed_experts: int
    num_shared_experts: int
    num_experts_per_tok: int
    moe_intermediate_size: int
55
+
56
+
57
@dataclass(frozen=True)
class PositionTraits:
    """Positional-encoding settings: RoPE / YaRN / AliBi / none.

    Every field is optional; a bare `PositionTraits()` means plain RoPE with
    no further details known.
    """

    rope_type: Literal["rope", "yarn", "alibi", "none"] = "rope"
    rope_theta: float | None = None
    rope_scaling_factor: float | None = None
    max_position_embeddings: int | None = None
65
+
66
+
67
@dataclass(frozen=True)
class ArchitectureProfile:
    """Full architectural snapshot of one model.

    Consumed by the weight / KV-cache formulas, engine matching, and fleet
    planning.
    """

    # config.json's `model_type` (lowercase)
    model_type: str
    # config.json's `architectures[]`
    architectures: tuple[str, ...]
    family: Family
    num_hidden_layers: int
    hidden_size: int
    vocab_size: int
    confidence: Confidence

    # Traits (composable — not all populated)
    attention: AttentionTraits | None = None
    moe: MoETraits | None = None
    position: PositionTraits | None = None
    # None = no window
    sliding_window: int | None = None

    # Pass-through for traits we haven't categorised yet
    auxiliary: dict[str, object] = field(default_factory=dict)

    @property
    def is_moe(self) -> bool:
        """True when MoE traits are attached."""
        return self.moe is not None

    @property
    def is_sparse_attention(self) -> bool:
        """True when attention traits exist and their variant is NSA or CSA_HCA."""
        attn = self.attention
        return attn is not None and attn.variant in ("NSA", "CSA_HCA")
src/llm_cal/architecture/traits.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Independent trait sub-detectors.
2
+
3
+ Each function inspects config.json and returns a trait dataclass (or None).
4
+ They co-exist: a MoE+MLA+CSA_HCA model matches all three.
5
+
6
+ Dispatch order inside `detect_attention()` is critical because some keys are
7
+ ambiguous (e.g. num_kv_heads < num_heads can be GQA OR a side-effect of MLA
8
+ where there's a single compressed KV head).
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from typing import Any
14
+
15
+ from llm_cal.architecture.profile import (
16
+ AttentionTraits,
17
+ MoETraits,
18
+ PositionTraits,
19
+ )
20
+
21
+
22
def detect_moe(config: dict[str, Any]) -> MoETraits | None:
    """MoE detection — presence of any routed-expert key signals MoE.

    Args:
        config: Parsed config.json contents.

    Returns:
        None when none of ``n_routed_experts`` / ``num_local_experts`` /
        ``num_experts`` holds a truthy value (dense model); otherwise the MoE
        shape. Optional fields that are absent or explicitly ``null`` fall
        back to 0 shared experts, 1 expert per token, and the dense
        ``intermediate_size`` (or 0) as the expert width.
    """
    routed = (
        config.get("n_routed_experts")
        or config.get("num_local_experts")
        or config.get("num_experts")
    )
    if not routed:
        return None

    # `None` covers both "key absent" and "key present but null"; an explicit
    # 0 in the alternate spelling is still passed through unchanged.
    per_tok = config.get("num_experts_per_tok") or config.get("num_experts_per_token")
    expert_width = config.get("moe_intermediate_size") or config.get("intermediate_size")

    return MoETraits(
        num_routed_experts=int(routed),
        num_shared_experts=int(config.get("n_shared_experts") or 0),
        num_experts_per_tok=int(per_tok) if per_tok is not None else 1,
        moe_intermediate_size=int(expert_width or 0),
    )
42
+
43
+
44
def detect_attention(config: dict[str, Any]) -> AttentionTraits:
    """Attention variant detection — order-sensitive.

    Priority (first match wins on variant, but shape fields always populated):
      1. CSA+HCA: compress_ratios array, length matches num_hidden_layers
      2. NSA:     nsa_config / sparse_attention_cfg present
      3. MLA:     q_lora_rank OR kv_lora_rank present
      4. GQA/MQA: num_kv_heads < num_heads
      5. MHA:     default

    Args:
        config: Parsed config.json contents. Numeric keys that are absent or
            explicitly ``null`` fall back to defaults instead of raising:
            1 attention head, kv heads equal to attention heads, 0 layers.

    Returns:
        AttentionTraits for the first matching variant.
    """
    # `or 1` also guards the head_dim division below against a zero head count.
    num_heads = int(config.get("num_attention_heads") or 1)
    # A null num_key_value_heads means "same as attention heads"; an explicit
    # value (even 0) is kept as given.
    kv_raw = config.get("num_key_value_heads")
    num_kv_heads = int(kv_raw) if kv_raw is not None else num_heads
    head_dim = int(config.get("head_dim") or ((config.get("hidden_size") or 0) // num_heads) or 1)
    num_layers = int(config.get("num_hidden_layers") or 0)

    q_lora = config.get("q_lora_rank")
    kv_lora = config.get("kv_lora_rank")
    compress_ratios = config.get("compress_ratios")
    has_nsa = "nsa_config" in config or "sparse_attention_cfg" in config

    # CSA+HCA: length check guards against future variants that happen to use the
    # same key name with different semantics. Reviewer flagged this.
    # Accepted lengths:
    #   - num_hidden_layers
    #   - num_hidden_layers + num_nextn_predict_layers (DeepSeek MTP: one extra
    #     ratio for the next-token prediction head)
    nextn = int(config.get("num_nextn_predict_layers") or 0)
    accepted_lengths = {num_layers, num_layers + nextn} if num_layers > 0 else set()
    if (
        isinstance(compress_ratios, list)
        and num_layers > 0
        and len(compress_ratios) in accepted_lengths
    ):
        return AttentionTraits(
            variant="CSA_HCA",
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_dim=head_dim,
            q_lora_rank=int(q_lora) if q_lora else None,
            kv_lora_rank=int(kv_lora) if kv_lora else None,
            compress_ratios=tuple(compress_ratios),
        )

    if has_nsa:
        nsa_cfg = config.get("nsa_config") or config.get("sparse_attention_cfg", {})
        nsa_topk = None
        if isinstance(nsa_cfg, dict):
            nsa_topk = nsa_cfg.get("topk") or nsa_cfg.get("index_topk")
        return AttentionTraits(
            variant="NSA",
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_dim=head_dim,
            nsa_topk=int(nsa_topk) if nsa_topk else None,
        )

    if q_lora or kv_lora:
        return AttentionTraits(
            variant="MLA",
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_dim=head_dim,
            q_lora_rank=int(q_lora) if q_lora else None,
            kv_lora_rank=int(kv_lora) if kv_lora else None,
        )

    if num_kv_heads < num_heads:
        variant = "MQA" if num_kv_heads == 1 else "GQA"
        return AttentionTraits(
            variant=variant,  # type: ignore[arg-type]
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_dim=head_dim,
        )

    return AttentionTraits(
        variant="MHA",
        num_heads=num_heads,
        num_kv_heads=num_kv_heads,
        head_dim=head_dim,
    )
125
+
126
+
127
def detect_position(config: dict[str, Any]) -> PositionTraits:
    """Read the positional-encoding settings out of config.json.

    The scaling type comes from ``rope_scaling["type"]`` or
    ``rope_scaling["rope_type"]``; anything other than rope / yarn / alibi /
    none is reported as plain "rope". Numeric fields are converted only when
    the config holds a truthy value, otherwise they stay None.
    """
    rope_scaling = config.get("rope_scaling") or {}

    declared_type = rope_scaling.get("type") or rope_scaling.get("rope_type") or "rope"
    rope_type = declared_type.lower()
    if rope_type not in ("rope", "yarn", "alibi", "none"):
        rope_type = "rope"

    theta = None
    if config.get("rope_theta"):
        theta = float(config["rope_theta"])

    scaling_factor = None
    if rope_scaling.get("factor"):
        scaling_factor = float(rope_scaling["factor"])

    max_positions = None
    if config.get("max_position_embeddings"):
        max_positions = int(config["max_position_embeddings"])

    return PositionTraits(
        rope_type=rope_type,  # type: ignore[arg-type]
        rope_theta=theta,
        rope_scaling_factor=scaling_factor,
        max_position_embeddings=max_positions,
    )
143
+
144
+
145
def detect_sliding_window(config: dict[str, Any]) -> int | None:
    """Return window size if sliding-window attention is used, else None.

    Args:
        config: Parsed config.json contents.

    Returns:
        ``int(config["sliding_window"])`` when the key holds a non-zero value
        and the config does not switch the feature off; None otherwise.
    """
    # Some configs (e.g. Qwen2-style) carry a `sliding_window` size together
    # with `use_sliding_window: false`; the window is then not in effect and
    # must not cap downstream KV-cache estimates.
    if config.get("use_sliding_window") is False:
        return None
    sw = config.get("sliding_window")
    if sw is None or sw == 0:
        return None
    return int(sw)
src/llm_cal/benchmark/__init__.py ADDED
File without changes
src/llm_cal/benchmark/dataset.yaml ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Reference benchmark dataset — curated anchor points for validating llm-cal
2
+ # output against publicly-known values.
3
+ #
4
+ # This is NOT a synthetic benchmark. Each entry cites where the expected
5
+ # values came from — HF API, model card text, vLLM/SGLang recipe, or
6
+ # hand computation in the design doc. If you add an entry, cite sources.
7
+ #
8
+ # The runner (`llm-cal benchmark`) fetches each model's live config and
9
+ # compares the tool's output against these expectations. Failures mean
10
+ # either the tool drifted or the reference data is stale.
11
+ schema_version: 1
12
+ entries:
13
+ # ------------------------------------------------------------
14
+ # Signature case — DeepSeek-V4-Flash. Every claim here is the
15
+ # reason this tool exists (vs gpu_poor's naive FP8 assumption).
16
+ # ------------------------------------------------------------
17
+ - name: "DeepSeek-V4-Flash on 8x H800 (tool's reference case)"
18
+ model_id: deepseek-ai/DeepSeek-V4-Flash
19
+ gpu: H800
20
+ engine: vllm
21
+ expectations:
22
+ - field: attention_variant
23
+ expected: CSA_HCA
24
+ source: "config.json compress_ratios length=44 matches n_layers+n_mtp"
25
+ - field: quantization
26
+ expected: FP4_FP8_MIXED
27
+ source: "HF model card: 'FP4 + FP8 Mixed: MoE experts FP4, others FP8'"
28
+ - field: weight_bytes
29
+ expected_min: 158_000_000_000
30
+ expected_max: 162_000_000_000
31
+ source: "HF siblings API (46x ~3.57 GB safetensors shards ≈ 160 GB)"
32
+ - field: fleet_prod_gpus
33
+ expected: 8
34
+ source: "Design doc hand computation: 8x H800 for prod-scale concurrency"
35
+ - field: is_moe
36
+ expected: true
37
+ source: "config.json n_routed_experts=256"
38
+
39
+ # ------------------------------------------------------------
40
+ # Dense GQA — Qwen2.5-72B. Validates:
41
+ # - dense (no MoE) detection
42
+ # - BF16/FP16 quantization path
43
+ # - GQA KV sharding math (critical for Llama-family models)
44
+ # ------------------------------------------------------------
45
+ - name: "Qwen2.5-72B on 8x H100 (GQA reference)"
46
+ model_id: Qwen/Qwen2.5-72B-Instruct
47
+ gpu: H100
48
+ engine: vllm
49
+ expectations:
50
+ - field: attention_variant
51
+ expected: GQA
52
+ source: "config.json num_kv_heads=8 < num_attention_heads=64"
53
+ - field: quantization
54
+ expected: FP16
55
+ source: "config.json torch_dtype=bfloat16, no quantization_config"
56
+ - field: weight_bytes
57
+ expected_min: 140_000_000_000
58
+ expected_max: 150_000_000_000
59
+ source: "HF siblings API — 72.7B params × 2 bytes ≈ 145 GB"
60
+ - field: is_moe
61
+ expected: false
62
+ source: "config.json has no n_routed_experts / num_local_experts"
63
+ - field: fleet_prod_gpus_at_most
64
+ expected: 8
65
+ source: "Weights fit on 8x H100 (145 GB / 8 ≈ 18 GB per GPU)"
66
+
67
+ # ------------------------------------------------------------
68
+ # DeepSeek-V3 (classic MoE + MLA, not V3.2's NSA) — validates MLA detection
69
+ # ------------------------------------------------------------
70
+ - name: "DeepSeek-V3 on H800 (MoE+MLA, no sparse attention)"
71
+ model_id: deepseek-ai/DeepSeek-V3
72
+ gpu: H800
73
+ engine: vllm
74
+ expectations:
75
+ - field: attention_variant
76
+ expected: MLA
77
+ source: "config.json q_lora_rank=1536, no compress_ratios or nsa_config"
78
+ - field: is_moe
79
+ expected: true
80
+ source: "config.json n_routed_experts=256"
81
+ - field: quantization
82
+ expected: FP8
83
+ source: "config.json quantization_config.quant_method=fp8"
84
+ - field: weight_bytes
85
+ expected_min: 680_000_000_000
86
+ expected_max: 700_000_000_000
87
+ source: "HF siblings API — 671B params × 1 byte (FP8) ≈ 670 GB"
88
+
89
+ # ------------------------------------------------------------
90
+ # Mixtral 8x7B — dense-MoE variant, non-MLA
91
+ # ------------------------------------------------------------
92
+ - name: "Mixtral 8x7B on 4x H100 (standard MoE, no MLA)"
93
+ model_id: mistralai/Mixtral-8x7B-v0.1
94
+ gpu: H100
95
+ engine: vllm
96
+ expectations:
97
+ - field: attention_variant
98
+ expected: GQA
99
+ source: "config.json num_kv_heads=8 < num_attention_heads=32"
100
+ - field: is_moe
101
+ expected: true
102
+ source: "config.json num_local_experts=8"
103
+ - field: quantization
104
+ expected: FP16
105
+ source: "config.json torch_dtype=bfloat16, no quantization_config"
106
+ - field: weight_bytes
107
+ expected_min: 90_000_000_000
108
+ expected_max: 100_000_000_000
109
+ source: "HF siblings API — 46.7B total params × 2 bytes ≈ 93 GB"
110
+
111
+ # ------------------------------------------------------------
112
+ # DeepSeek-V3.2 — MLA structurally (NSA at runtime). Validates:
113
+ # - model_type=deepseek_v32 is recognized
114
+ # - FP8 quantization (inherited from V3)
115
+ # - Tool honestly reports MLA because config.json exposes only MLA
116
+ # keys; runtime NSA behavior is NOT in config. Future detection
117
+ # improvement could override based on model_type.
118
+ # ------------------------------------------------------------
119
+ - name: "DeepSeek-V3.2 on H800 (MLA config; NSA runtime)"
120
+ model_id: deepseek-ai/DeepSeek-V3.2
121
+ gpu: H800
122
+ engine: vllm
123
+ expectations:
124
+ - field: attention_variant
125
+ expected: MLA
126
+ source: >-
127
+ config.json q_lora_rank=1536, no nsa_config key — detector
128
+ correctly reports MLA. V3.2's NSA sparse behavior is a runtime
129
+ feature selected by vllm --attention-backend nsa, NOT encoded
130
+ in config.json keys. TODO: detector could upgrade to NSA when
131
+ model_type matches known NSA models.
132
+ - field: is_moe
133
+ expected: true
134
+ source: "config.json n_routed_experts=256"
135
+ - field: quantization
136
+ expected: FP8
137
+ source: "config.json quantization_config.quant_method=fp8"
138
+
139
+ # ------------------------------------------------------------
140
+ # Qwen3-30B-A3B — validates qwen3_moe model_type + GQA+MoE combo
141
+ # ------------------------------------------------------------
142
+ - name: "Qwen3-30B-A3B on H100 (Qwen3 MoE, GQA)"
143
+ model_id: Qwen/Qwen3-30B-A3B
144
+ gpu: H100
145
+ engine: vllm
146
+ expectations:
147
+ - field: attention_variant
148
+ expected: GQA
149
+ source: "config.json num_kv_heads=4 < num_attention_heads=32"
150
+ - field: is_moe
151
+ expected: true
152
+ source: "config.json num_local_experts or similar MoE key present"
153
+ - field: quantization
154
+ expected: FP16
155
+ source: "config.json torch_dtype=bfloat16"
156
+ - field: weight_bytes
157
+ expected_min: 58_000_000_000
158
+ expected_max: 65_000_000_000
159
+ source: "HF siblings API — 30.5B total params × 2 bytes ≈ 61 GB"
160
+
161
+ # ------------------------------------------------------------
162
+ # Qwen2.5-7B — small-model sanity + qwen2 model_type
163
+ # ------------------------------------------------------------
164
+ - name: "Qwen2.5-7B on H100 (small dense, sanity)"
165
+ model_id: Qwen/Qwen2.5-7B-Instruct
166
+ gpu: H100
167
+ engine: vllm
168
+ expectations:
169
+ - field: attention_variant
170
+ expected: GQA
171
+ source: "config.json num_kv_heads=4 < num_attention_heads=28"
172
+ - field: is_moe
173
+ expected: false
174
+ source: "config.json has no MoE keys"
175
+ - field: quantization
176
+ expected: FP16
177
+ source: "config.json torch_dtype=bfloat16"
178
+ - field: weight_bytes
179
+ expected_min: 14_000_000_000
180
+ expected_max: 16_000_000_000
181
+ source: "HF siblings API — 7.6B params × 2 bytes ≈ 15.2 GB"
182
+
183
+ # ------------------------------------------------------------
184
+ # Phi-4 — validates phi3 model_type + dense 14B
185
+ # ------------------------------------------------------------
186
+ - name: "Phi-4 on L40S (phi3 architecture, 14B dense)"
187
+ model_id: microsoft/Phi-4
188
+ gpu: L40S
189
+ engine: vllm
190
+ expectations:
191
+ - field: attention_variant
192
+ expected: GQA
193
+ source: "config.json num_kv_heads=10 < num_attention_heads=40"
194
+ - field: is_moe
195
+ expected: false
196
+ source: "config.json has no MoE keys"
197
+ - field: quantization
198
+ expected: FP16
199
+ source: "config.json torch_dtype=bfloat16"
200
+ - field: weight_bytes
201
+ expected_min: 28_000_000_000
202
+ expected_max: 31_000_000_000
203
+ source: "HF siblings API — 14.7B params × 2 bytes ≈ 29.3 GB"
src/llm_cal/benchmark/runner.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Benchmark runner — validate llm-cal's output against curated references.
2
+
3
+ For each entry in dataset.yaml, run the evaluator against the model, then
4
+ compare each `expectations[]` field with the predicted value. Report a
5
+ table of pass/fail per check, plus a summary.
6
+
7
+ This is NOT a synthetic benchmark. Every expected value cites a source
8
+ (HF API, model card text, vLLM recipe, hand computation) so users can
9
+ audit.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from dataclasses import dataclass
15
+ from functools import lru_cache
16
+ from importlib.resources import files
17
+ from pathlib import Path
18
+ from typing import Literal
19
+
20
+ from pydantic import BaseModel, Field
21
+ from rich.console import Console
22
+ from rich.table import Table
23
+
24
+ from llm_cal.common.yaml_loader import load_yaml
25
+ from llm_cal.core.evaluator import EvaluationReport, Evaluator
26
+
27
+ Status = Literal["PASS", "FAIL", "SKIP"]
28
+
29
+
30
class Expectation(BaseModel):
    """One curated check: the report field to inspect and the value it should have."""

    # Name of the check; `_evaluate_field` branches on this string.
    field: str
    # Exactly one of these is used depending on `field`
    expected: str | int | bool | None = None
    # Inclusive range bounds, read by the `weight_bytes` check.
    expected_min: int | None = None
    expected_max: int | None = None
    # Where the expected value comes from, so the check can be audited.
    source: str
37
+
38
+
39
class BenchmarkEntry(BaseModel):
    """One benchmark scenario: a model/GPU/engine combination and its checks."""

    # Human-readable label shown in the results table.
    name: str
    # The three values below are passed to `Evaluator.evaluate` by `run_all`.
    model_id: str
    gpu: str
    engine: str = "vllm"
    # Checks to run against the evaluation report; empty when omitted.
    expectations: list[Expectation] = Field(default_factory=list)
45
+
46
+
47
class BenchmarkDataset(BaseModel):
    """Top-level schema that `dataset.yaml` is validated against."""

    # Version marker of the dataset file layout.
    schema_version: int
    # Every benchmark scenario, in file order.
    entries: list[BenchmarkEntry]
50
+
51
+
52
@dataclass(frozen=True)
class CheckResult:
    """Immutable outcome of one expectation, already formatted for display."""

    # Name of the benchmark entry this check belongs to.
    entry_name: str
    # Name of the checked field (copied from the expectation).
    field: str
    # "PASS", "FAIL" or "SKIP".
    status: Status
    # Display strings for the tool's value and the reference value.
    predicted: str
    expected: str
    # Provenance of the reference value (copied from the expectation).
    source: str
    # Optional extra detail; `run_all` stores the exception text here when
    # the evaluation itself failed.
    note: str | None = None
61
+
62
+
63
def _default_dataset_path() -> Path:
    """Return the path of the `dataset.yaml` resource in `llm_cal.benchmark`."""
    package_root = files("llm_cal.benchmark")
    bundled = package_root.joinpath("dataset.yaml")
    return Path(str(bundled))
65
+
66
+
67
@lru_cache(maxsize=1)
def load_dataset(path: Path | None = None) -> BenchmarkDataset:
    """Load and validate the benchmark dataset.

    Args:
        path: YAML file to read; a falsy value selects the bundled dataset.

    Returns:
        The validated dataset. The most recent result is memoized per `path`
        argument by `lru_cache(maxsize=1)`.
    """
    target = path or _default_dataset_path()
    return load_yaml(target, BenchmarkDataset)
70
+
71
+
72
def run_all(
    evaluator: Evaluator | None = None,
    dataset: BenchmarkDataset | None = None,
) -> list[CheckResult]:
    """Evaluate every dataset entry and check each of its expectations.

    Args:
        evaluator: Evaluator to use; a default `Evaluator()` is built when omitted.
        dataset: Dataset to run; the bundled dataset is loaded when omitted.

    Returns:
        One `CheckResult` per expectation, in dataset order. When evaluating
        an entry raises, all of that entry's expectations are reported as
        SKIP with the exception text in `note`.
    """
    active_evaluator = evaluator or Evaluator()
    active_dataset = dataset or load_dataset()
    collected: list[CheckResult] = []
    for entry in active_dataset.entries:
        try:
            report = active_evaluator.evaluate(
                model_id=entry.model_id,
                gpu=entry.gpu,
                engine=entry.engine,
            )
        except Exception as e:
            # One broken entry must not abort the whole run: record why it
            # was skipped and move on to the next entry.
            failure_note = f"{type(e).__name__}: {e}"
            collected.extend(
                [
                    CheckResult(
                        entry_name=entry.name,
                        field=exp.field,
                        status="SKIP",
                        predicted="(evaluation failed)",
                        expected=_fmt_expected(exp),
                        source=exp.source,
                        note=failure_note,
                    )
                    for exp in entry.expectations
                ]
            )
        else:
            collected.extend(
                [_check_one(entry.name, report, exp) for exp in entry.expectations]
            )
    return collected
104
+
105
+
106
def _check_one(entry_name: str, report: EvaluationReport, exp: Expectation) -> CheckResult:
    """Evaluate one expectation against `report` and wrap it as a `CheckResult`."""
    predicted_text, verdict = _evaluate_field(report, exp)
    expected_text = _fmt_expected(exp)
    return CheckResult(
        entry_name=entry_name,
        field=exp.field,
        status=verdict,
        predicted=predicted_text,
        expected=expected_text,
        source=exp.source,
    )
116
+
117
+
118
+ def _evaluate_field(report: EvaluationReport, exp: Expectation) -> tuple[str, Status]:
119
+ """Return (predicted_str, PASS/FAIL/SKIP) for this field.
120
+
121
+ Each `field` name matches a documented check in dataset.yaml.
122
+ """
123
+ if exp.field == "attention_variant":
124
+ attn_actual = report.profile.attention.variant if report.profile.attention else "(none)"
125
+ return attn_actual, ("PASS" if attn_actual == exp.expected else "FAIL")
126
+
127
+ if exp.field == "quantization":
128
+ quant_actual = report.weight.quantization_guess.value
129
+ return quant_actual, ("PASS" if quant_actual == exp.expected else "FAIL")
130
+
131
+ if exp.field == "is_moe":
132
+ actual_bool = report.profile.is_moe
133
+ return str(actual_bool), ("PASS" if actual_bool == exp.expected else "FAIL")
134
+
135
+ if exp.field == "weight_bytes":
136
+ actual_int = report.weight.total_bytes.value
137
+ low = exp.expected_min or 0
138
+ high = exp.expected_max or (1 << 62)
139
+ passed = low <= actual_int <= high
140
+ return f"{actual_int:,}", ("PASS" if passed else "FAIL")
141
+
142
+ if exp.field == "fleet_prod_gpus":
143
+ if report.fleet is None:
144
+ return "(no fleet)", "SKIP"
145
+ prod = next((o for o in report.fleet.options if o.tier == "prod"), None)
146
+ if prod is None:
147
+ return "(no prod tier)", "SKIP"
148
+ passed = prod.gpu_count == exp.expected
149
+ return str(prod.gpu_count), ("PASS" if passed else "FAIL")
150
+
151
+ if exp.field == "fleet_prod_gpus_at_most":
152
+ if report.fleet is None:
153
+ return "(no fleet)", "SKIP"
154
+ prod = next((o for o in report.fleet.options if o.tier == "prod"), None)
155
+ if prod is None:
156
+ return "(no prod tier)", "SKIP"
157
+ passed = prod.gpu_count <= int(exp.expected or 0)
158
+ return f"{prod.gpu_count} (max {exp.expected})", ("PASS" if passed else "FAIL")
159
+
160
+ return "(unknown field)", "SKIP"
161
+
162
+
163
+ def _fmt_expected(exp: Expectation) -> str:
164
+ if exp.expected is not None:
165
+ return str(exp.expected)
166
+ if exp.expected_min is not None or exp.expected_max is not None:
167
+ lo = f"{exp.expected_min:,}" if exp.expected_min is not None else "-∞"
168
+ hi = f"{exp.expected_max:,}" if exp.expected_max is not None else "+∞"
169
+ return f"[{lo}, {hi}]"
170
+ return "(unspecified)"
171
+
172
+
173
def render_results(results: list[CheckResult], console: Console | None = None) -> None:
    """Print the benchmark results table, a summary line and any notes.

    Args:
        results: Check results, expected to be grouped by entry (as returned
            by `run_all`); the entry name is printed only on the first row of
            each consecutive group.
        console: Rich console to print to; a new `Console()` is used when
            omitted.
    """
    # Local import: provenance and exception text is free-form, so it is
    # escaped to keep rich from interpreting brackets in it as markup.
    from rich.markup import escape

    console = console or Console()

    table = Table(
        title="Benchmark results",
        title_justify="left",
        show_header=True,
        header_style="dim",
        box=None,
        padding=(0, 2),
    )
    table.add_column("entry")
    table.add_column("field")
    table.add_column("predicted")
    table.add_column("expected")
    # The failure hint printed below points readers at this column.
    table.add_column("source")
    table.add_column("status")

    status_styles = {
        "PASS": "bold green",
        "FAIL": "bold red",
        "SKIP": "dim yellow",
    }

    current_entry = None
    for r in results:
        entry_cell = r.entry_name if r.entry_name != current_entry else ""
        current_entry = r.entry_name
        table.add_row(
            entry_cell,
            r.field,
            r.predicted,
            r.expected,
            escape(r.source),
            f"[{status_styles[r.status]}]{r.status}[/]",
        )

    console.print(table)

    total = len(results)
    passed = sum(1 for r in results if r.status == "PASS")
    failed = sum(1 for r in results if r.status == "FAIL")
    skipped = sum(1 for r in results if r.status == "SKIP")

    summary = (
        f"Total: {total} "
        f"[bold green]PASS: {passed}[/] "
        f"[bold red]FAIL: {failed}[/] "
        f"[dim yellow]SKIP: {skipped}[/]"
    )
    console.print(summary)

    # Show why checks carry a note (e.g. the evaluation raised). All checks
    # of a failed entry share one note, so print each (entry, note) once.
    notes = dict.fromkeys((r.entry_name, r.note) for r in results if r.note)
    for entry_name, note in notes:
        console.print(f"[dim]{escape(entry_name)}: {escape(note)}[/]")

    if failed > 0:
        console.print(
            "[dim]Failures show the tool's prediction diverges from a curated "
            "source. Check the `source` column for the expected-value provenance.[/]"
        )
228
+
229
+
230
def exit_code_from(results: list[CheckResult]) -> int:
    """Map results to a process exit code: 1 if any check FAILed, else 0."""
    for outcome in results:
        if outcome.status == "FAIL":
            return 1
    return 0
src/llm_cal/cli.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CLI entry point. Thin shell over `Evaluator` + rich formatter."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+
7
+ import typer
8
+ from rich.console import Console
9
+
10
+ from llm_cal.benchmark.runner import exit_code_from, render_results, run_all
11
+ from llm_cal.common.i18n import detect_locale_from_env, get_locale, set_locale, t
12
+ from llm_cal.core.evaluator import Evaluator
13
+ from llm_cal.core.explain import build as build_explain
14
+ from llm_cal.hardware.loader import load_database
15
+ from llm_cal.llm_review.reviewer import run_review
16
+ from llm_cal.model_source.base import (
17
+ AuthRequiredError,
18
+ ModelNotFoundError,
19
+ ModelSource,
20
+ SourceUnavailableError,
21
+ )
22
+ from llm_cal.model_source.huggingface import HuggingFaceSource
23
+ from llm_cal.model_source.modelscope import ModelScopeSource
24
+ from llm_cal.output.formatter import (
25
+ render,
26
+ render_explain,
27
+ render_gpu_list,
28
+ render_llm_review,
29
+ )
30
+
31
# Set locale from env first; --lang flag can override inside main()
set_locale(detect_locale_from_env())

# Typer application; `main` below is registered on it through @app.command().
app = typer.Typer(
    name="llm-cal",
    help="LLM inference hardware calculator.",
    no_args_is_help=True,
)
# Two rich consoles: one for regular output, one bound to stderr for errors.
_console = Console()
_err = Console(stderr=True)
41
+
42
+
43
@app.command()
def main(
    model_id: str | None = typer.Argument(None, help="HuggingFace or ModelScope model id"),
    gpu: str | None = typer.Option(None, "--gpu", help="GPU type, e.g. H800, A100-80G"),
    engine: str = typer.Option("vllm", "--engine", help="Inference engine: vllm | sglang"),
    gpu_count: int | None = typer.Option(
        None, "--gpu-count", help="Force GPU count (otherwise tool recommends)"
    ),
    context_length: int | None = typer.Option(
        None, "--context-length", help="Context length for KV cache estimation"
    ),
    refresh: bool = typer.Option(False, "--refresh", help="Bypass cache and re-fetch"),
    lang: str | None = typer.Option(
        None,
        "--lang",
        help="Output language: en | zh (default auto-detects from LANG env)",
    ),
    list_gpus: bool = typer.Option(
        False,
        "--list-gpus",
        help="List all supported GPUs and exit (no model_id needed)",
    ),
    benchmark: bool = typer.Option(
        False,
        "--benchmark",
        help=(
            "Run the curated benchmark dataset: compare tool output against "
            "reference values from HF API, model cards, vLLM recipes. "
            "Requires network. Exit 0 on all-pass, 1 if any FAIL."
        ),
    ),
    input_tokens: int = typer.Option(
        2000,
        "--input-tokens",
        help="Input token budget for prefill-latency estimation (default: 2000).",
    ),
    output_tokens: int = typer.Option(
        512,
        "--output-tokens",
        help="Output token budget for total-latency math (default: 512).",
    ),
    target_tokens_per_sec: float = typer.Option(
        30.0,
        "--target-tokens-per-sec",
        help="SLA: per-user decode tokens/second (drives L bound). Default: 30.",
    ),
    prefill_util: float = typer.Option(
        0.40,
        "--prefill-util",
        help="Compute utilization factor for prefill (empirical, default 0.40).",
    ),
    decode_bw_util: float = typer.Option(
        0.50,
        "--decode-bw-util",
        help="Memory-bandwidth utilization factor for decode (default 0.50).",
    ),
    concurrency_degradation: float = typer.Option(
        1.0,
        "--concurrency-degradation",
        help=(
            "High-concurrency throughput degradation factor (default 1.0 = "
            "no degradation — the honest baseline). If your engine drops "
            "to 60% efficiency under load, pass 1.67. See docs/methodology.md."
        ),
    ),
    explain: bool = typer.Option(
        False,
        "--explain",
        help=(
            "Print the full derivation trace (formula, inputs, step-by-step, "
            "source) for every non-trivial number. Feed the output to an LLM "
            "if you want a second opinion on the math."
        ),
    ),
    llm_review: bool = typer.Option(
        False,
        "--llm-review",
        help=(
            "EXPERIMENTAL: send the derivation trace to an LLM for a second "
            "opinion. Output is tagged [llm-opinion] and never overrides the "
            "6 primary labels. Requires env vars: LLM_CAL_REVIEWER_API_KEY "
            "(required), LLM_CAL_REVIEWER_BASE_URL (default OpenAI), "
            "LLM_CAL_REVIEWER_MODEL (default gpt-4o)."
        ),
    ),
    source: str = typer.Option(
        "huggingface",
        "--source",
        help=(
            "Model source: huggingface (default) | modelscope. "
            "Auth via HF_TOKEN or MODELSCOPE_API_TOKEN env var."
        ),
    ),
) -> None:
    """Evaluate a model against target hardware."""
    # --lang overrides the env-detected locale set at import time. Accept any
    # letter case, and tell the user when the value is unsupported instead of
    # silently ignoring it (the previously resolved locale is then kept).
    if lang is not None:
        normalized_lang = lang.strip().lower()
        if normalized_lang in ("en", "zh"):
            set_locale(normalized_lang)  # type: ignore[arg-type]
        else:
            _err.print(
                f"[yellow]Unsupported --lang '{lang}'. Use 'en' or 'zh'; "
                "keeping the auto-detected locale.[/yellow]"
            )

    # Meta commands short-circuit before requiring model_id + --gpu.
    if list_gpus:
        render_gpu_list(load_database(), _console)
        return

    if benchmark:
        results = run_all()
        render_results(results, _console)
        sys.exit(exit_code_from(results))

    if not model_id:
        _err.print("[red]Missing argument MODEL_ID. Use --help for usage.[/red]")
        raise typer.Exit(code=1)
    if not gpu:
        _err.print("[red]Missing option --gpu. Use --list-gpus to see choices.[/red]")
        raise typer.Exit(code=1)

    # Resolve the model source; short aliases "hf" / "ms" are accepted.
    src_obj: ModelSource
    src_lower = source.lower()
    if src_lower in ("hf", "huggingface"):
        src_obj = HuggingFaceSource()
    elif src_lower in ("ms", "modelscope"):
        src_obj = ModelScopeSource()
    else:
        _err.print(
            f"[red]Unknown --source '{source}'. Use 'huggingface' or 'modelscope'.[/red]"
        )
        raise typer.Exit(code=1)

    evaluator = Evaluator(source=src_obj)
    try:
        report = evaluator.evaluate(
            model_id=model_id,
            gpu=gpu,
            engine=engine,
            gpu_count=gpu_count,
            context_length=context_length,
            refresh=refresh,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            target_tokens_per_sec=target_tokens_per_sec,
            prefill_utilization=prefill_util,
            decode_bw_utilization=decode_bw_util,
            concurrency_degradation=concurrency_degradation,
        )
    # Each model-source failure gets its own exit code: 2 = auth required,
    # 3 = model not found, 4 = source unavailable.
    except AuthRequiredError as e:
        _err.print(f"[bold red]{t('cli.err.auth_required')}[/bold red] {e}")
        sys.exit(2)
    except ModelNotFoundError as e:
        _err.print(f"[bold red]{t('cli.err.model_not_found')}[/bold red] {e}")
        sys.exit(3)
    except SourceUnavailableError as e:
        _err.print(f"[bold red]{t('cli.err.source_unavailable')}[/bold red] {e}")
        sys.exit(4)

    render(report, _console)
    # The derivation trace feeds both --explain and --llm-review, so build it
    # once and only when one of them was requested.
    explain_entries = build_explain(report) if (explain or llm_review) else []
    if explain:
        render_explain(explain_entries, _console)
    if llm_review:
        # Locale at this point has been resolved by set_locale() calls above.
        result = run_review(explain_entries, locale=get_locale())
        render_llm_review(result, _console)
204
+
205
+
206
+ if __name__ == "__main__":
207
+ app()
src/llm_cal/command_generator/__init__.py ADDED
File without changes
src/llm_cal/command_generator/sglang.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generate a ready-to-copy SGLang launch command."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from llm_cal.architecture.profile import ArchitectureProfile
6
+ from llm_cal.engine_compat.loader import EngineCompatEntry
7
+
8
+
9
def generate_sglang_command(
    model_id: str,
    profile: ArchitectureProfile,
    tensor_parallel_size: int,
    entry: EngineCompatEntry | None,
    max_model_len: int | None = None,
) -> str:
    """Build a multi-line `python -m sglang.launch_server ...` command string.

    Args:
        model_id: Model identifier passed to `--model-path`.
        profile: Architecture profile; supplies the fallback context length
            and the model type used for the trust-remote-code decision.
        tensor_parallel_size: Value for `--tp`.
        entry: Optional compatibility entry whose required and optional flags
            are appended, in that order.
        max_model_len: Explicit context length; when None the profile's
            `max_position_embeddings` is used if available.

    Returns:
        The command with its lines joined by a backslash continuation.
    """
    context_len = max_model_len
    if context_len is None and profile.position is not None:
        context_len = profile.position.max_position_embeddings

    parts: list[str] = [
        "python -m sglang.launch_server",
        f" --model-path {model_id}",
        f" --tp {tensor_parallel_size}",
    ]
    # A falsy context length (None or 0) means the flag is omitted.
    if context_len:
        parts.append(f" --context-length {context_len}")
    if _needs_trust_remote_code(profile.model_type):
        parts.append(" --trust-remote-code")
    parts.append(" --mem-fraction-static 0.9")

    if entry is not None:
        compat_flags = (*entry.required_flags, *entry.optional_flags)
        parts.extend(" " + _render_flag(item.flag, item.value) for item in compat_flags)

    return " \\\n".join(parts)
41
+
42
+
43
+ def _render_flag(flag: str, value: str | None) -> str:
44
+ if value is None:
45
+ return flag
46
+ return f"{flag} {value}"
47
+
48
+
49
+ def _needs_trust_remote_code(model_type: str) -> bool:
50
+ return model_type.startswith(("deepseek", "qwen2_moe", "qwen3_moe", "mixtral"))
src/llm_cal/command_generator/vllm.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generate a ready-to-copy vllm serve command."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from llm_cal.architecture.profile import ArchitectureProfile
6
+ from llm_cal.engine_compat.loader import EngineCompatEntry
7
+
8
+
9
def generate_vllm_command(
    model_id: str,
    profile: ArchitectureProfile,
    tensor_parallel_size: int,
    entry: EngineCompatEntry | None,
    max_model_len: int | None = None,
) -> str:
    """Build a multi-line `vllm serve ...` command string.

    Args:
        model_id: Model identifier placed right after `vllm serve`.
        profile: Architecture profile; supplies the fallback max model length
            and the model type used for the trust-remote-code decision.
        tensor_parallel_size: Value for `--tensor-parallel-size`.
        entry: Optional compatibility entry; its required_flags and then its
            optional_flags are appended verbatim.
        max_model_len: Explicit `--max-model-len`; when None the profile's
            `max_position_embeddings` is used if available.

    Returns:
        The command with its lines joined by a backslash continuation.
    """
    # Caller override wins; otherwise fall back to the profile's limit.
    model_len = max_model_len
    if model_len is None and profile.position is not None:
        model_len = profile.position.max_position_embeddings

    pieces: list[str] = ["vllm serve " + model_id]
    pieces.append(f" --tensor-parallel-size {tensor_parallel_size}")
    # A falsy length (None or 0) means the flag is omitted.
    if model_len:
        pieces.append(f" --max-model-len {model_len}")

    # Heuristic on model_type decides whether --trust-remote-code is added.
    if _needs_trust_remote_code(profile.model_type):
        pieces.append(" --trust-remote-code")

    pieces.append(" --gpu-memory-utilization 0.9")

    if entry is not None:
        for group in (entry.required_flags, entry.optional_flags):
            pieces.extend(" " + _render_flag(item.flag, item.value) for item in group)

    return " \\\n".join(pieces)
45
+
46
+
47
+ def _render_flag(flag: str, value: str | None) -> str:
48
+ if value is None:
49
+ return flag
50
+ return f"{flag} {value}"
51
+
52
+
53
+ def _needs_trust_remote_code(model_type: str) -> bool:
54
+ """Models that ship custom modeling code in the repo."""
55
+ return model_type.startswith(("deepseek", "qwen2_moe", "qwen3_moe", "mixtral"))
src/llm_cal/common/__init__.py ADDED
File without changes
src/llm_cal/common/i18n.py ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Minimal i18n layer. No gettext, no external deps.
2
+
3
+ Supports `en` and `zh`. Defaults to `en` but auto-detects from LC_ALL/LANG
4
+ when they start with `zh` (covers zh_CN, zh_TW, zh_HK, etc.).
5
+
6
+ Usage:
7
+ from llm_cal.common.i18n import t, set_locale
8
+ set_locale("zh")
9
+ print(t("labels.legend")) # "标签"
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import os
15
+ from typing import Literal
16
+
17
# Closed set of supported locale codes.
Locale = Literal["en", "zh"]

# Module-wide active locale; written by set_locale(), read by get_locale() and t().
_current_locale: Locale = "en"
20
+
21
+
22
+ _MESSAGES: dict[str, dict[Locale, str]] = {
23
+ # CLI help text
24
+ "cli.help": {
25
+ "en": "LLM inference hardware calculator.",
26
+ "zh": "大模型推理硬件计算器。",
27
+ },
28
+ "cli.arg.model_id": {
29
+ "en": "HuggingFace or ModelScope model id",
30
+ "zh": "HuggingFace 或 ModelScope 的 model id",
31
+ },
32
+ "cli.opt.gpu": {
33
+ "en": "GPU type, e.g. H800, A100-80G",
34
+ "zh": "GPU 型号,例如 H800、A100-80G",
35
+ },
36
+ "cli.opt.engine": {
37
+ "en": "Inference engine: vllm | sglang",
38
+ "zh": "推理引擎:vllm | sglang",
39
+ },
40
+ "cli.opt.gpu_count": {
41
+ "en": "Force GPU count (otherwise tool recommends min/dev/prod)",
42
+ "zh": "强制指定 GPU 张数(默认由工具推荐 min/dev/prod 三档)",
43
+ },
44
+ "cli.opt.context_length": {
45
+ "en": "Context length for KV cache estimation",
46
+ "zh": "用于 KV cache 估算的上下文长度",
47
+ },
48
+ "cli.opt.refresh": {
49
+ "en": "Bypass cache and re-fetch",
50
+ "zh": "绕过缓存重新拉取",
51
+ },
52
+ "cli.opt.lang": {
53
+ "en": "Output language: en | zh",
54
+ "zh": "输出语言:en | zh",
55
+ },
56
+ "cli.err.auth_required": {
57
+ "en": "Authentication required:",
58
+ "zh": "需要认证:",
59
+ },
60
+ "cli.err.model_not_found": {
61
+ "en": "Model not found:",
62
+ "zh": "模型未找到:",
63
+ },
64
+ "cli.err.source_unavailable": {
65
+ "en": "Source unavailable:",
66
+ "zh": "数据源不可用:",
67
+ },
68
+ # Panel / section titles
69
+ "panel.via": {"en": "via", "zh": "来源"},
70
+ "section.architecture": {"en": "Architecture", "zh": "架构"},
71
+ "section.weights": {"en": "Weights", "zh": "权重"},
72
+ "section.kv_cache": {
73
+ "en": "KV cache per request (BF16/FP16)",
74
+ "zh": "单请求 KV Cache(BF16/FP16)",
75
+ },
76
+ "section.reconciliation": {
77
+ "en": "Quantization reconciliation (observed vs predicted per scheme)",
78
+ "zh": "量化方案对账(观测值 vs 各方案预测值)",
79
+ },
80
+ "section.engine_compat": {
81
+ "en": "Engine compatibility",
82
+ "zh": "推理引擎兼容性",
83
+ },
84
+ "section.hardware": {"en": "Target hardware", "zh": "目标硬件"},
85
+ "section.labels": {"en": "labels:", "zh": "标签:"},
86
+ # Architecture row labels
87
+ "arch.model_type": {"en": "model_type", "zh": "模型类型"},
88
+ "arch.family": {"en": "family", "zh": "架构族"},
89
+ "arch.confidence": {"en": "confidence", "zh": "识别置信度"},
90
+ "arch.layers": {"en": "layers", "zh": "层数"},
91
+ "arch.hidden_size": {"en": "hidden_size", "zh": "隐藏维度"},
92
+ "arch.vocab_size": {"en": "vocab_size", "zh": "词表大小"},
93
+ "arch.attention": {"en": "attention", "zh": "注意力机制"},
94
+ "arch.compress_ratios": {"en": "compress_ratios", "zh": "压缩比数组"},
95
+ "arch.moe": {"en": "moe", "zh": "MoE"},
96
+ "arch.sliding_window": {"en": "sliding_window", "zh": "滑动窗口"},
97
+ "arch.max_position": {
98
+ "en": "max_position_embeddings",
99
+ "zh": "最大上下文长度",
100
+ },
101
+ "arch.none": {"en": "(none)", "zh": "(无)"},
102
+ "arch.compress_ratios_summary": {
103
+ "en": "len={n}, dense_layers={dense}",
104
+ "zh": "长度={n},dense 层数={dense}",
105
+ },
106
+ "arch.moe_summary": {
107
+ "en": "{routed} routed + {shared} shared, top-{topk}",
108
+ "zh": "{routed} 个 routed + {shared} 个 shared,top-{topk}",
109
+ },
110
+ "arch.attn_summary": {
111
+ "en": "{variant} (heads={heads}, kv_heads={kv_heads}, head_dim={head_dim})",
112
+ "zh": "{variant}(heads={heads},kv_heads={kv_heads},head_dim={head_dim})",
113
+ },
114
+ "arch.unsupported_state_space": {
115
+ "en": "State-space models are not supported in v0.1 (planned for v0.3+).",
116
+ "zh": "状态空间模型(Mamba 类)在 v0.1 暂不支持,计划在 v0.3+ 加入。",
117
+ },
118
+ # Weights rows
119
+ "weights.safetensors_bytes": {
120
+ "en": "safetensors bytes",
121
+ "zh": "safetensors 总字节",
122
+ },
123
+ "weights.params_estimated": {
124
+ "en": "estimated total params",
125
+ "zh": "参数量(估算)",
126
+ },
127
+ "weights.bits_per_param": {"en": "bits/param", "zh": "每参数位数"},
128
+ "weights.quant_guess": {"en": "quantization guess", "zh": "量化方案推断"},
129
+ # Reconciliation
130
+ "recon.scheme": {"en": "scheme", "zh": "量化方案"},
131
+ "recon.predicted": {"en": "predicted bytes", "zh": "预测字节"},
132
+ "recon.delta": {"en": "delta", "zh": "差值"},
133
+ "recon.error_pct": {"en": "error %", "zh": "误差 %"},
134
+ "recon.over": {"en": "over", "zh": "偏高"},
135
+ "recon.under": {"en": "under", "zh": "偏低"},
136
+ "recon.best": {"en": "best match:", "zh": "最佳匹配:"},
137
+ # KV cache
138
+ "kv.context": {"en": "context", "zh": "上下文"},
139
+ "kv.kv_cache": {"en": "KV cache", "zh": "KV Cache"},
140
+ "kv.label": {"en": "label", "zh": "标签"},
141
+ "kv.tokens": {"en": "tokens", "zh": "tokens"},
142
+ # Engine compatibility
143
+ "engine.version_spec": {"en": "version", "zh": "版本要求"},
144
+ "engine.support": {"en": "support", "zh": "支持程度"},
145
+ "engine.verification": {"en": "verification", "zh": "验证等级"},
146
+ "engine.required_flags": {"en": "required flags", "zh": "必需参数"},
147
+ "engine.optional_flags": {"en": "optional flags", "zh": "可选参数"},
148
+ "engine.caveats": {"en": "caveats", "zh": "注意事项"},
149
+ "engine.sources": {"en": "sources", "zh": "来源"},
150
+ "engine.no_match": {
151
+ "en": "No compatibility entry for this model + engine in v0.1 matrix.",
152
+ "zh": "v0.1 兼容矩阵中暂无此模型 + 引擎的条目。",
153
+ },
154
+ # Hardware
155
+ "hw.memory": {"en": "memory", "zh": "显存"},
156
+ "hw.nvlink_bandwidth": {"en": "NVLink bandwidth", "zh": "NVLink 带宽"},
157
+ "hw.fp16_tflops": {"en": "FP16 TFLOPS", "zh": "FP16 算力"},
158
+ "hw.fp8_support": {"en": "FP8 support", "zh": "FP8 支持"},
159
+ "hw.fp4_support": {"en": "FP4 support", "zh": "FP4 支持"},
160
+ "hw.notes": {"en": "notes", "zh": "备注"},
161
+ "hw.spec_source": {"en": "spec source", "zh": "规格来源"},
162
+ # GPU list subcommand
163
+ "gpus.list.title": {
164
+ "en": "Supported GPUs",
165
+ "zh": "支持的 GPU",
166
+ },
167
+ "gpus.col.id": {"en": "id", "zh": "型号"},
168
+ "gpus.col.memory": {"en": "memory", "zh": "显存"},
169
+ "gpus.col.nvlink": {"en": "NVLink / fabric", "zh": "互联带宽"},
170
+ "gpus.col.fp16": {"en": "FP16 TFLOPS", "zh": "FP16"},
171
+ "gpus.col.fp8": {"en": "FP8", "zh": "FP8"},
172
+ "gpus.col.fp4": {"en": "FP4", "zh": "FP4"},
173
+ "gpus.col.aliases": {"en": "aliases", "zh": "别名"},
174
+ "gpus.total": {
175
+ "en": "Total: {count} GPUs (pass any id or alias to --gpu)",
176
+ "zh": "共 {count} 款(--gpu 后面填 ID 或别名均可)",
177
+ },
178
+ "hw.unknown": {
179
+ "en": "Unknown GPU '{gpu}'. Known: {known}",
180
+ "zh": "未知 GPU '{gpu}'。已知型号:{known}",
181
+ },
182
+ "hw.bool_yes": {"en": "yes", "zh": "是"},
183
+ "hw.bool_no": {"en": "no", "zh": "否"},
184
+ # Labels — localized display names. Enum identity stays English.
185
+ "label.verified": {"en": "verified", "zh": "已验证"},
186
+ "label.inferred": {"en": "inferred", "zh": "推断"},
187
+ "label.estimated": {"en": "estimated", "zh": "估算"},
188
+ "label.cited": {"en": "cited", "zh": "引用"},
189
+ "label.unverified": {"en": "unverified", "zh": "未经验证"},
190
+ "label.unknown": {"en": "unknown", "zh": "未知"},
191
+ "label.llm-opinion": {"en": "llm-opinion", "zh": "LLM 观点"},
192
+ # Source attribution
193
+ "source.pr": {"en": "PR", "zh": "PR"},
194
+ "source.release_notes": {"en": "release notes", "zh": "release note"},
195
+ "source.announcement": {"en": "announcement", "zh": "官方公告"},
196
+ "source.tested": {"en": "tested", "zh": "实测"},
197
+ "source.captured_on": {"en": "captured on", "zh": "采集于"},
198
+ # Fleet planner
199
+ "section.fleet": {
200
+ "en": "Recommended fleet",
201
+ "zh": "推荐 GPU 张数",
202
+ },
203
+ "fleet.col.tier": {"en": "tier", "zh": "档位"},
204
+ "fleet.col.gpus": {"en": "GPUs", "zh": "GPU 数"},
205
+ "fleet.col.weight_per_gpu": {
206
+ "en": "weight / GPU",
207
+ "zh": "单卡权重",
208
+ },
209
+ "fleet.col.headroom_per_gpu": {
210
+ "en": "headroom / GPU",
211
+ "zh": "单卡余量",
212
+ },
213
+ "fleet.col.fit": {"en": "fit", "zh": "评估"},
214
+ "fleet.col.concurrent_at_ctx": {
215
+ "en": "concurrent @ {ctx}",
216
+ "zh": "并发 @ {ctx}",
217
+ },
218
+ "fleet.tier.min": {"en": "min", "zh": "最小"},
219
+ "fleet.tier.dev": {"en": "dev", "zh": "开发"},
220
+ "fleet.tier.prod": {"en": "prod", "zh": "生产"},
221
+ "fleet.best_marker": {
222
+ "en": "= recommended",
223
+ "zh": "= 推荐档位",
224
+ },
225
+ "fleet.constraint": {"en": "constraint:", "zh": "约束:"},
226
+ "fleet.forced": {
227
+ "en": "Forced GPU count (--gpu-count was set)",
228
+ "zh": "已强制指定 GPU 张数(--gpu-count)",
229
+ },
230
+ "fleet.gpu_spec_unknown": {
231
+ "en": "Fleet planning skipped — GPU spec unknown.",
232
+ "zh": "GPU 规格未知,跳过 fleet 规划。",
233
+ },
234
+ # Command generator
235
+ "section.command": {
236
+ "en": "Generated command",
237
+ "zh": "生成的启动命令",
238
+ },
239
+ "command.tier_note": {
240
+ "en": "tier: {tier} ({gpus} GPUs)",
241
+ "zh": "档位:{tier}({gpus} 张)",
242
+ },
243
+ # Performance section
244
+ "section.performance": {
245
+ "en": "Performance analysis",
246
+ "zh": "性能分析",
247
+ },
248
+ "perf.assumptions_note": {
249
+ "en": (
250
+ "Assumes input={input_tokens} tokens, output={output_tokens} tokens, "
251
+ "target {target_tps} tok/s per user. "
252
+ "Utilization: prefill={prefill_util:.0%} / decode_bw={decode_util:.0%} "
253
+ "/ concurrency_degradation={degradation:.2f}x. "
254
+ "All numbers are [estimated] — see docs/methodology.md for formula sources "
255
+ "and override via --prefill-util / --decode-bw-util / --concurrency-degradation."
256
+ ),
257
+ "zh": (
258
+ "假设输入 {input_tokens} tokens、输出 {output_tokens} tokens、"
259
+ "每用户目标 {target_tps} tok/s。"
260
+ "利用率:prefill={prefill_util:.0%} / decode_bw={decode_util:.0%} "
261
+ "/ 并发退化={degradation:.2f}x。"
262
+ "所有数字都是 [估算]——公式来源见 docs/methodology.md,"
263
+ "可通过 --prefill-util / --decode-bw-util / --concurrency-degradation 覆盖。"
264
+ ),
265
+ },
266
+ "perf.prefill_latency": {
267
+ "en": "Prefill latency (single request)",
268
+ "zh": "Prefill 延迟(单请求)",
269
+ },
270
+ "perf.decode_throughput_cluster": {
271
+ "en": "Decode throughput (cluster)",
272
+ "zh": "Decode 吞吐(集群)",
273
+ },
274
+ "perf.decode_throughput_per_gpu": {
275
+ "en": "Decode throughput (per GPU)",
276
+ "zh": "Decode 吞吐(单卡)",
277
+ },
278
+ "perf.decode_moe_active_optimistic": {
279
+ "en": "Decode throughput (MoE active-only, optimistic)",
280
+ "zh": "Decode 吞吐(MoE 仅激活专家,乐观估算)",
281
+ },
282
+ "perf.k_bound": {
283
+ "en": "K bound (memory-capacity)",
284
+ "zh": "K 上限(显存容量)",
285
+ },
286
+ "perf.l_bound": {
287
+ "en": "L bound (compute / bandwidth @ SLA)",
288
+ "zh": "L 上限(算力/带宽 @ SLA)",
289
+ },
290
+ "perf.max_concurrent": {
291
+ "en": "Max concurrent",
292
+ "zh": "最大并发",
293
+ },
294
+ "perf.bottleneck": {
295
+ "en": "Bottleneck",
296
+ "zh": "瓶颈类型",
297
+ },
298
+ "perf.bottleneck.memory_capacity": {
299
+ "en": "Memory capacity",
300
+ "zh": "显存容量",
301
+ },
302
+ "perf.bottleneck.memory_bandwidth": {
303
+ "en": "Memory bandwidth / compute",
304
+ "zh": "显存带宽 / 算力",
305
+ },
306
+ "perf.bottleneck.compute": {
307
+ "en": "Compute",
308
+ "zh": "算力",
309
+ },
310
+ "perf.bottleneck.insufficient_data": {
311
+ "en": "Insufficient data",
312
+ "zh": "数据不足",
313
+ },
314
+ "perf.optimization.header": {
315
+ "en": "Optimization suggestions",
316
+ "zh": "优化建议",
317
+ },
318
+ "perf.opt.quantize_int4": {
319
+ "en": "Quantize to INT4: weight bytes halve → decode tok/s roughly 2× → concurrency scales accordingly.",
320
+ "zh": "量化到 INT4:权重字节减半 → decode tok/s 约翻倍 → 并发能力随之提升。",
321
+ },
322
+ "perf.opt.relax_sla": {
323
+ "en": "Relax SLA: if per-user target drops to 15 tok/s, L bound roughly doubles.",
324
+ "zh": "放宽 SLA:若每用户目标降至 15 tok/s,L 上限约翻倍。",
325
+ },
326
+ "perf.opt.kv_fp8": {
327
+ "en": "KV cache FP8 quantization: halves per-request KV, doubles the K bound at long context.",
328
+ "zh": "KV cache 量化到 FP8:单请求 KV 减半,长上下文下 K 上限约翻倍。",
329
+ },
330
+ "perf.opt.moe_offload": {
331
+ "en": "MoE expert offload to CPU: frees HBM for more KV cache at the cost of PCIe latency per new expert.",
332
+ "zh": "MoE 专家卸载到 CPU:释放 HBM 给 KV cache,代价是新专家激活时的 PCIe 延迟。",
333
+ },
334
+ # Explain section
335
+ "section.explain": {
336
+ "en": "Full derivation traces (--explain)",
337
+ "zh": "完整推导链(--explain)",
338
+ },
339
+ "explain.formula": {"en": "Formula", "zh": "公式"},
340
+ "explain.inputs": {"en": "Inputs", "zh": "输入"},
341
+ "explain.steps": {"en": "Computation", "zh": "计算步骤"},
342
+ "explain.result": {"en": "Result", "zh": "结果"},
343
+ "explain.source": {"en": "Source", "zh": "来源"},
344
+ "explain.see_also": {"en": "See also", "zh": "延伸阅读"},
345
+ "explain.intro": {
346
+ "en": (
347
+ "Each entry below shows the formula used, the inputs that went in, "
348
+ "every computation step, and the primary source. "
349
+ "Paste any single entry into an LLM and ask 'does this math check out?' "
350
+ "— the tool stays deterministic, the second opinion is yours."
351
+ ),
352
+ "zh": (
353
+ "下面每一项都给出所用公式、输入、每一步计算、主要来源。"
354
+ "把任一项复制粘贴给 LLM,问『这个推理对吗』即可。"
355
+ "工具保持确定性,second opinion 交给你。"
356
+ ),
357
+ },
358
+ # LLM review section
359
+ "section.llm_review": {
360
+ "en": "LLM second opinion (--llm-review, EXPERIMENTAL)",
361
+ "zh": "LLM 审阅(--llm-review,实验性)",
362
+ },
363
+ "llm_review.disclaimer": {
364
+ "en": (
365
+ "⚠ This is a second opinion from an external LLM ({model} via {base_url}). "
366
+ "It is tagged [llm-opinion] and NEVER overrides the 6 primary labels. "
367
+ "LLMs can be wrong; the tool's deterministic output takes precedence."
368
+ ),
369
+ "zh": (
370
+ "⚠ 以下是来自外部 LLM({model},经 {base_url})的第二意见。"
371
+ "标签为 [LLM 观点],**永远不覆盖** 前 6 级主标签。"
372
+ "LLM 可能出错;工具的确定性输出优先。"
373
+ ),
374
+ },
375
+ "llm_review.unavailable": {
376
+ "en": "LLM review unavailable: {error}",
377
+ "zh": "LLM 审阅不可用:{error}",
378
+ },
379
+ "llm_review.setup_hint": {
380
+ "en": (
381
+ "To enable: export LLM_CAL_REVIEWER_API_KEY=<key> "
382
+ "[optional: LLM_CAL_REVIEWER_BASE_URL, LLM_CAL_REVIEWER_MODEL]"
383
+ ),
384
+ "zh": (
385
+ "启用方法:export LLM_CAL_REVIEWER_API_KEY=<key> "
386
+ "[可选:LLM_CAL_REVIEWER_BASE_URL、LLM_CAL_REVIEWER_MODEL]"
387
+ ),
388
+ },
389
+ }
390
+
391
+
392
def set_locale(loc: Locale) -> None:
    """Replace the module-wide active locale used for subsequent lookups."""
    global _current_locale
    _current_locale = loc
395
+
396
+
397
def get_locale() -> Locale:
    """Return the module-wide active locale."""
    return _current_locale
399
+
400
+
401
def detect_locale_from_env() -> Locale:
    """Auto-detect the locale from the standard locale environment variables.

    The variables are consulted in POSIX precedence order (LC_ALL, then
    LC_MESSAGES, then LANG). The first one with a non-empty value decides, so
    a higher-precedence English setting is not overridden by a
    lower-precedence Chinese one.

    Returns:
        "zh" when the deciding value starts with "zh" (case-insensitive,
        covering zh_CN, zh_TW, zh_HK, ...); "en" otherwise, including when
        none of the variables is set.
    """
    for var in ("LC_ALL", "LC_MESSAGES", "LANG"):
        val = os.environ.get(var, "").strip().lower()
        if val:
            # An empty variable counts as unset; a set one ends the search.
            return "zh" if val.startswith("zh") else "en"
    return "en"
408
+
409
+
410
def t(key: str, **kwargs: object) -> str:
    """Look up `key` in the active locale and fill in its placeholders.

    Unknown keys return the key itself, so a missing translation is visible
    in the output. A locale missing from the bundle falls back to English,
    then to the key. Placeholders are substituted only when keyword
    arguments are given; if formatting raises KeyError or IndexError the
    unformatted template is returned instead.
    """
    bundle = _MESSAGES.get(key)
    if bundle is None:
        return key

    template = bundle.get(_current_locale, bundle.get("en", key))
    if not kwargs:
        return template

    try:
        rendered = template.format(**kwargs)
    except (KeyError, IndexError):
        return template
    return rendered
src/llm_cal/common/yaml_loader.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic-validated YAML loader.
2
+
3
+ Shared between engine_compat and hardware modules. Supports `lazy=True` param
4
+ (v0.1 does not implement lazy — signature reserved for v0.2 when matrix > 100).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+ from typing import TypeVar
11
+
12
+ import yaml
13
+ from pydantic import BaseModel, ValidationError
14
+
15
+ T = TypeVar("T", bound=BaseModel)
16
+
17
+
18
class YamlLoadError(Exception):
    """Raised when a YAML file is missing, empty, unparsable, or fails schema validation."""
20
+
21
+
22
def load_yaml(path: str | Path, schema: type[T], *, lazy: bool = False) -> T:
    """Load + validate a YAML file against a Pydantic schema.

    Args:
        path: YAML file to load.
        schema: Pydantic model the YAML is expected to conform to.
        lazy: Reserved for v0.2 (on-demand loading of large matrices). v0.1
            ignores this; document-scale data is small enough that eager
            loading is fine.

    Returns:
        An instance of ``schema`` built from the file's content.

    Raises:
        YamlLoadError: If the file does not exist, cannot be read or decoded
            as UTF-8, is not valid YAML, is empty, or does not satisfy
            ``schema``. The underlying exception is chained as ``__cause__``.
    """
    _ = lazy  # v0.1 behavior is always eager
    p = Path(path)
    # EAFP: open directly instead of exists()-then-open, so a file removed in
    # between (or a directory / unreadable file) still surfaces as YamlLoadError.
    try:
        with p.open("r", encoding="utf-8") as f:
            raw = yaml.safe_load(f)
    except FileNotFoundError as e:
        raise YamlLoadError(f"YAML file not found: {p}") from e
    except (OSError, UnicodeDecodeError) as e:
        raise YamlLoadError(f"Cannot read YAML file {p}: {e}") from e
    except yaml.YAMLError as e:
        raise YamlLoadError(f"YAML parse error in {p}: {e}") from e

    if raw is None:
        raise YamlLoadError(f"YAML file {p} is empty")

    try:
        return schema.model_validate(raw)
    except ValidationError as e:
        raise YamlLoadError(f"Schema validation failed for {p}:\n{e}") from e
src/llm_cal/core/__init__.py ADDED
File without changes
src/llm_cal/core/cache.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Disk cache for model-source responses.
2
+
3
+ Key design decisions (from /plan-eng-review Issue #2 + Issue #10 critical):
4
+
5
+ - Key = (source, model_id, commit_sha). Commit sha is included so a repo update
6
+ invalidates cache automatically — prevents the critical regression of serving
7
+ stale data after the upstream model updates.
8
+ - TTL = 7 days default. Even without a commit change, we force re-fetch weekly.
9
+ - `--refresh` flag sets `bypass=True` on `get()` — caller drives it.
10
+ - Store location: platformdirs user cache dir, subdirectory `llm-cal`.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from dataclasses import asdict, dataclass, is_dataclass
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ import diskcache
20
+ from platformdirs import user_cache_dir
21
+
22
+ from llm_cal.model_source.base import ModelArtifact, SiblingFile
23
+
24
+ _DEFAULT_TTL_SECONDS = 7 * 24 * 60 * 60 # 7 days
25
+
26
+
27
@dataclass(frozen=True)
class CacheKey:
    """Identity of one cached artifact: source, model id and commit sha."""

    source: str
    model_id: str
    commit_sha: str | None

    def to_string(self) -> str:
        """Render the key as ``source::model_id::sha``, using ``HEAD`` when no sha is set."""
        revision = self.commit_sha or "HEAD"
        return "::".join((self.source, self.model_id, revision))
35
+
36
+
37
class ArtifactCache:
    """Persistent cache for ModelArtifact instances.

    Entries are stored in a ``diskcache.Cache`` under the string form of a
    ``CacheKey`` and expire after ``ttl_seconds``. Keys without a commit sha
    are never read or written, because freshness cannot be proven for them.
    """

    def __init__(
        self, cache_dir: str | Path | None = None, ttl_seconds: int = _DEFAULT_TTL_SECONDS
    ) -> None:
        """Open (creating if needed) the cache directory.

        Args:
            cache_dir: Directory for the cache files. Defaults to the
                platformdirs user cache dir for ``llm-cal``.
            ttl_seconds: Expiry applied to every entry written by ``set()``.
        """
        if cache_dir is None:
            cache_dir = user_cache_dir("llm-cal", appauthor=False)
        Path(cache_dir).mkdir(parents=True, exist_ok=True)
        self._cache = diskcache.Cache(str(cache_dir))
        self._ttl = ttl_seconds

    def get(self, key: CacheKey, bypass: bool = False) -> ModelArtifact | None:
        """Look up an artifact. `bypass=True` always returns None (used by --refresh).

        If `key.commit_sha` is None (no revision pinning), we never serve from cache
        because we can't prove freshness.

        An entry that can no longer be decoded (e.g. written by a release with
        a different artifact schema) is evicted and reported as a miss, so the
        caller re-fetches instead of crashing until the TTL expires.
        """
        if bypass or key.commit_sha is None:
            return None
        cache_key = key.to_string()
        raw = self._cache.get(cache_key)
        if raw is None:
            return None
        try:
            return _deserialize_artifact(raw)
        except (KeyError, TypeError):
            # Missing field or unexpected payload shape: drop the stale entry.
            self._cache.delete(cache_key)
            return None

    def set(self, key: CacheKey, artifact: ModelArtifact) -> None:
        """Cache an artifact. No-op if commit_sha is None (can't guarantee freshness)."""
        if key.commit_sha is None:
            return
        self._cache.set(key.to_string(), _serialize_artifact(artifact), expire=self._ttl)

    def invalidate(self, key: CacheKey) -> bool:
        """Explicit invalidation, returns True if something was removed."""
        return bool(self._cache.delete(key.to_string()))

    def clear(self) -> None:
        """Wipe the whole cache — for tests and `llm-cal cache clear` (future)."""
        self._cache.clear()

    def close(self) -> None:
        """Close the underlying disk cache."""
        self._cache.close()
78
+
79
+
80
+ def _serialize_artifact(a: ModelArtifact) -> dict[str, Any]:
81
+ return {
82
+ "source": a.source,
83
+ "model_id": a.model_id,
84
+ "commit_sha": a.commit_sha,
85
+ "config": a.config,
86
+ "siblings": [asdict(s) if is_dataclass(s) else s for s in a.siblings],
87
+ }
88
+
89
+
90
+ def _deserialize_artifact(raw: dict[str, Any]) -> ModelArtifact:
91
+ return ModelArtifact(
92
+ source=raw["source"],
93
+ model_id=raw["model_id"],
94
+ commit_sha=raw["commit_sha"],
95
+ config=raw["config"],
96
+ siblings=tuple(SiblingFile(**s) for s in raw["siblings"]),
97
+ )
src/llm_cal/core/evaluator.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Evaluator — the single orchestration layer.
2
+
3
+ v0.1 partial implementation: composes model_source + detector + weight_analyzer
4
+ + reconciler + kv_cache + engine_compat + hardware. Fleet planner and command
5
+ generator land in Week 5 remainder.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+
12
+ from llm_cal.architecture.detector import detect
13
+ from llm_cal.architecture.formulas.kv_cache import compute_kv_cache_bytes
14
+ from llm_cal.architecture.formulas.weight import estimate_total_params
15
+ from llm_cal.architecture.profile import ArchitectureProfile
16
+ from llm_cal.command_generator.sglang import generate_sglang_command
17
+ from llm_cal.command_generator.vllm import generate_vllm_command
18
+ from llm_cal.core.cache import ArtifactCache, CacheKey
19
+ from llm_cal.engine_compat.loader import EngineCompatEntry, find_match
20
+ from llm_cal.fleet.planner import FleetRecommendation, plan
21
+ from llm_cal.hardware.loader import GPUSpec, UnknownGPUError, lookup
22
+ from llm_cal.model_source.base import ModelArtifact, ModelSource
23
+ from llm_cal.model_source.huggingface import HuggingFaceSource
24
+ from llm_cal.output.labels import AnnotatedValue
25
+ from llm_cal.performance.compute import (
26
+ DEFAULT_DECODE_BW_UTILIZATION,
27
+ DEFAULT_PREFILL_UTILIZATION,
28
+ DecodeEstimate,
29
+ PrefillEstimate,
30
+ estimate_decode,
31
+ estimate_prefill,
32
+ )
33
+ from llm_cal.performance.concurrency import ConcurrencyAnalysis
34
+ from llm_cal.performance.concurrency import analyze as analyze_concurrency
35
+ from llm_cal.weight_analyzer import WeightReport, analyze
36
+ from llm_cal.weight_analyzer.fingerprint import (
37
+ QuantFingerprint,
38
+ from_config,
39
+ from_safetensors_dtypes,
40
+ )
41
+ from llm_cal.weight_analyzer.reconciler import ReconciliationReport, reconcile
42
+ from llm_cal.weight_analyzer.safetensors_reader import (
43
+ fetch_tensor_dtypes,
44
+ pick_sample_shard,
45
+ )
46
+
47
+ _KV_REFERENCE_CTX = 131_072 # matches fleet.planner's _REFERENCE_CTX_TOKENS
48
+
49
+
50
@dataclass(frozen=True)
class EvaluationReport:
    """Immutable bundle of everything the evaluator produces for one model.

    The first eleven fields are always supplied by the evaluator; the rest
    default to empty/None and are filled only when the corresponding stage ran.
    """

    # --- identity of the evaluated model and the requested target ---
    model_id: str
    source: str
    commit_sha: str | None
    gpu: str
    gpu_spec: GPUSpec | None
    # Message to show when the GPU wasn't found; None otherwise.
    gpu_error: str | None
    engine: str

    # --- architecture and weight analysis ---
    profile: ArchitectureProfile
    weight: WeightReport
    total_params_estimate: AnnotatedValue[int]
    reconciliation: ReconciliationReport

    # --- optional downstream results ---
    kv_cache_by_context: dict[int, AnnotatedValue[int]] = field(default_factory=dict)
    engine_match: EngineCompatEntry | None = None
    fleet: FleetRecommendation | None = None
    generated_command: str | None = None

    # --- performance analysis: filled when user passes SLA args (or defaults) ---
    prefill: PrefillEstimate | None = None
    decode: DecodeEstimate | None = None
    concurrency: ConcurrencyAnalysis | None = None
    perf_input_tokens: int | None = None
    perf_output_tokens: int | None = None
    perf_target_tokens_per_sec: float | None = None
76
+
77
+
78
class Evaluator:
    """Single orchestration layer that turns a model id into an EvaluationReport.

    Pipeline: model_source -> detect -> analyze -> reconcile -> KV cache
    -> engine compat -> hardware lookup -> fleet plan -> launch command
    -> performance estimates (prefill / decode / concurrency).
    """

    def __init__(
        self,
        source: ModelSource | None = None,
        cache: ArtifactCache | None = None,
    ) -> None:
        """Wire up collaborators.

        Args:
            source: Where model metadata is fetched from; defaults to
                ``HuggingFaceSource()``.
            cache: Artifact cache; defaults to ``ArtifactCache()``.
        """
        self._source = source or HuggingFaceSource()
        self._cache = cache or ArtifactCache()

    def evaluate(
        self,
        model_id: str,
        gpu: str,
        engine: str,
        gpu_count: int | None = None,
        context_length: int | None = None,
        refresh: bool = False,
        input_tokens: int | None = None,
        output_tokens: int | None = None,
        target_tokens_per_sec: float | None = None,
        prefill_utilization: float = DEFAULT_PREFILL_UTILIZATION,
        decode_bw_utilization: float = DEFAULT_DECODE_BW_UTILIZATION,
        concurrency_degradation: float = 1.0,
    ) -> EvaluationReport:
        """Run the full pipeline for one model / GPU / engine combination.

        Args:
            model_id: Identifier passed to the model source.
            gpu: GPU name to look up in the hardware database.
            engine: Inference engine name (used for compatibility lookup and
                command generation).
            gpu_count: Forces the GPU count; when unset the fleet's best tier
                decides.
            context_length: When given, the only context length surveyed for
                KV cache, and the ``max_model_len`` handed to the command
                generator.
            refresh: Bypass the artifact cache lookup.
            input_tokens: Prompt length for the prefill estimate (2000 when
                unset).
            output_tokens: Recorded in the report (512 when unset).
            target_tokens_per_sec: Per-request decode target for the
                concurrency analysis (30.0 when unset).
            prefill_utilization: Utilization factor passed to
                ``estimate_prefill``.
            decode_bw_utilization: Bandwidth utilization factor passed to
                ``estimate_decode``.
            concurrency_degradation: Degradation factor passed to the
                concurrency analysis.

        Returns:
            The assembled report. An unknown GPU does not raise: its message
            is stored in ``gpu_error`` and the fleet / command / performance
            sections stay ``None``.
        """
        artifact = self._fetch(model_id, refresh=refresh)
        profile = detect(artifact.config)

        total_params_est = estimate_total_params(profile)
        total_params = total_params_est.value

        observed_bytes_for_fp = sum(
            (s.size or 0) for s in artifact.siblings if s.filename.endswith(".safetensors")
        )
        fingerprint = self._resolve_quant_fingerprint(
            artifact,
            observed_bytes=observed_bytes_for_fp,
            total_params=total_params if total_params > 0 else 0,
        )
        weight = analyze(
            artifact.siblings,
            total_params=total_params if total_params > 0 else None,
            fingerprint=fingerprint,
        )
        reconciliation = reconcile(
            weight.total_bytes.value,
            total_params or 1,
            fingerprint=fingerprint,
        )

        contexts_to_report = self._select_context_lengths(profile, context_length)
        kv_by_ctx = {
            ctx: compute_kv_cache_bytes(profile, seq_len=ctx, dtype_bytes=2)
            for ctx in contexts_to_report
        }

        # Engine compatibility — match by model_type alone (v0.1). Version
        # filtering can be added via a future --engine-version flag.
        engine_match = find_match(engine=engine, model_type=profile.model_type)

        # Hardware lookup — never raises out to CLI, we embed the error message
        # so the user sees a partial report instead of aborting.
        gpu_spec: GPUSpec | None = None
        gpu_error: str | None = None
        try:
            gpu_spec = lookup(gpu)
        except UnknownGPUError as e:
            gpu_error = str(e)

        # Fleet planning — only if we have a known GPU. The planner's reference
        # context is 128K; derive KV bytes there (computing fresh in case the
        # user chose a non-overlapping context_length override).
        fleet: FleetRecommendation | None = None
        generated_command: str | None = None
        if gpu_spec is not None and weight.total_bytes.value > 0:
            kv_ref = compute_kv_cache_bytes(profile, _KV_REFERENCE_CTX, dtype_bytes=2)
            kv_by_context_bytes = {ctx: av.value for ctx, av in kv_by_ctx.items() if av.value > 0}
            fleet = plan(
                profile=profile,
                weight_bytes=weight.total_bytes.value,
                kv_bytes_per_request_at_ref=max(1, kv_ref.value),
                gpu=gpu_spec,
                forced_gpu_count=gpu_count,
                kv_bytes_by_context=kv_by_context_bytes,
            )
            generated_command = _generate_command(
                engine=engine,
                model_id=model_id,
                profile=profile,
                tp=self._chosen_gpu_count(fleet, gpu_count),
                entry=engine_match,
                max_model_len=context_length,
            )

        # Performance analysis — runs whenever we have hardware + fleet.
        prefill_est: PrefillEstimate | None = None
        decode_est: DecodeEstimate | None = None
        concurrency_est: ConcurrencyAnalysis | None = None
        if gpu_spec is not None and fleet is not None and total_params > 0:
            # Same GPU count the launch command was generated for.
            chosen = self._chosen_gpu_count(fleet, gpu_count)
            # Resolve performance defaults when user didn't specify.
            eff_input = input_tokens or 2000
            eff_target = target_tokens_per_sec or 30.0

            prefill_est = estimate_prefill(
                profile=profile,
                total_params=total_params,
                gpu=gpu_spec,
                num_gpus=chosen,
                input_tokens=eff_input,
                utilization=prefill_utilization,
            )
            # MoE active ratio: active/total = (shared + experts_per_tok) / (shared + routed)
            moe_active_ratio: float | None = None
            if profile.moe is not None:
                active_experts = profile.moe.num_experts_per_tok + profile.moe.num_shared_experts
                total_experts = profile.moe.num_routed_experts + profile.moe.num_shared_experts
                if total_experts > 0:
                    moe_active_ratio = active_experts / total_experts
            decode_est = estimate_decode(
                profile=profile,
                total_weight_bytes=weight.total_bytes.value,
                gpu=gpu_spec,
                num_gpus=chosen,
                bw_utilization=decode_bw_utilization,
                moe_active_params_ratio=moe_active_ratio,
            )
            # Headroom at the chosen tier; if no option matches the chosen
            # count, fall back to the last option.
            chosen_option = next(
                (o for o in fleet.options if o.gpu_count == chosen),
                fleet.options[-1],
            )
            headroom_per_gpu = (
                chosen_option.usable_bytes_per_gpu - chosen_option.weight_bytes_per_gpu
            )
            # KV per request is taken at the 128K reference context when it was
            # surveyed, otherwise at the longest surveyed context.
            kv_ref_ctx = _KV_REFERENCE_CTX if _KV_REFERENCE_CTX in kv_by_ctx else max(kv_by_ctx)
            kv_ref_bytes: int = kv_by_ctx[kv_ref_ctx].value
            # Apply TP-aware sharding (same rule fleet planner uses).
            from llm_cal.fleet.planner import _kv_shards

            shards = _kv_shards(profile, chosen)
            kv_ref_per_gpu = max(1, kv_ref_bytes // shards)
            # Request KV lives per-GPU; under replication, it's the same value on all.
            # To convert to "how many requests fit", we divide *per-GPU* headroom
            # by *per-GPU* KV, so the per-GPU headroom is what gets passed below.
            headroom_per_req_view = max(0, headroom_per_gpu)
            concurrency_est = analyze_concurrency(
                cluster_headroom_bytes=headroom_per_req_view,
                kv_bytes_per_request=kv_ref_per_gpu,
                decode=decode_est,
                target_tokens_per_sec=eff_target,
                degradation=concurrency_degradation,
            )

        return EvaluationReport(
            model_id=model_id,
            source=artifact.source,
            commit_sha=artifact.commit_sha,
            gpu=gpu,
            gpu_spec=gpu_spec,
            gpu_error=gpu_error,
            engine=engine,
            profile=profile,
            weight=weight,
            total_params_estimate=total_params_est,
            reconciliation=reconciliation,
            kv_cache_by_context=kv_by_ctx,
            engine_match=engine_match,
            fleet=fleet,
            generated_command=generated_command,
            prefill=prefill_est,
            decode=decode_est,
            concurrency=concurrency_est,
            perf_input_tokens=(input_tokens or 2000) if fleet else None,
            perf_output_tokens=(output_tokens or 512) if fleet else None,
            perf_target_tokens_per_sec=(target_tokens_per_sec or 30.0) if fleet else None,
        )

    @staticmethod
    def _chosen_gpu_count(fleet: FleetRecommendation, gpu_count: int | None) -> int:
        """Return the user's forced GPU count, else the best tier's count.

        If no option carries the best tier, the first option's count is used.
        """
        if gpu_count:
            return gpu_count
        return next(
            (o.gpu_count for o in fleet.options if o.tier == fleet.best_tier),
            fleet.options[0].gpu_count,
        )

    def _fetch(self, model_id: str, refresh: bool) -> ModelArtifact:
        """Fetch the artifact from the source and reconcile it with the cache.

        The source is always queried first because the commit sha it returns
        is part of the cache key. On a cache hit the cached artifact is
        returned; otherwise the fresh artifact is stored and returned.
        NOTE(review): as written the cache never avoids the source fetch —
        confirm whether a cheaper sha-only lookup is intended.
        """
        artifact = self._source.fetch(model_id)
        key = CacheKey(
            source=self._source.name,
            model_id=model_id,
            commit_sha=artifact.commit_sha,
        )
        cached = self._cache.get(key, bypass=refresh)
        if cached is not None:
            return cached
        self._cache.set(key, artifact)
        return artifact

    def _resolve_quant_fingerprint(
        self,
        artifact: ModelArtifact,
        observed_bytes: int,
        total_params: int,
    ) -> QuantFingerprint | None:
        """Resolve the quantization scheme via authoritative evidence.

        Priority:
        1. config.json `quantization_config` — explicit author declaration.
           Free, no extra network call. But if its predicted bytes are
           wildly off (>15% from observed), fall through — config.json
           can be incomplete or stale (DeepSeek-V4-Flash declares
           `quant_method=fp8` but ships an FP4+FP8 mixed pack; trusting
           the declaration produces a 45% wrong answer).
        2. safetensors file header — per-tensor dtype fingerprint. One
           Range GET on the first shard. Ground truth.

        Returns the config-derived fingerprint (possibly None) whenever the
        safetensors route yields nothing. The reconciler falls back to
        bytes-only argmin when the result is None (v0.1.1 behavior).
        """
        fp = from_config(artifact.config)
        if fp is not None and self._fingerprint_matches_bytes(fp, observed_bytes, total_params):
            return fp

        shard = pick_sample_shard(artifact.siblings)
        if shard is None:
            return fp  # safetensors unavailable — best we can do is the config hint

        dtypes = fetch_tensor_dtypes(
            source=artifact.source,
            model_id=artifact.model_id,
            revision=artifact.commit_sha or "main",
            shard_filename=shard.filename,
        )
        if not dtypes:
            return fp

        st_fp = from_safetensors_dtypes(dtypes)
        # Header is ground truth — prefer it over config when both exist.
        return st_fp if st_fp is not None else fp

    @staticmethod
    def _fingerprint_matches_bytes(
        fp: QuantFingerprint, observed_bytes: int, total_params: int
    ) -> bool:
        """Sanity-check a fingerprint's predicted bytes against observed.

        Returns True if the declared scheme's predicted bytes are within 15%
        of observed. False means config.json is either lying or describes
        only part of the model — we should consult safetensors instead.
        When any input is unusable (unknown scheme, non-positive params or
        bytes) the check cannot run and True is returned.
        """
        from llm_cal.weight_analyzer import _QUANT_BPP

        bpp = _QUANT_BPP.get(fp.scheme, 0.0)
        if bpp <= 0 or total_params <= 0 or observed_bytes <= 0:
            return True  # can't verify — don't penalize the fingerprint
        predicted = bpp * total_params
        rel_err = abs(observed_bytes - predicted) / predicted
        return rel_err <= 0.15

    @staticmethod
    def _select_context_lengths(profile: ArchitectureProfile, override: int | None) -> list[int]:
        """Pick the context lengths to report KV cache for.

        An explicit ``override`` is used alone. Otherwise the standard
        4K / 32K / 128K points are surveyed, capped at the model's
        ``max_position_embeddings`` when known; a maximum above 128K is added
        as an extra point. If the model's maximum is below every standard
        point, the maximum itself is surveyed so the result is never empty.
        """
        if override is not None:
            return [override]
        candidates = [4_096, 32_768, 131_072]
        max_pos = profile.position.max_position_embeddings if profile.position else None
        if not max_pos:
            return candidates
        if max_pos > 131_072:
            candidates.append(max_pos)
        supported = [c for c in candidates if c <= max_pos]
        # Short-context models (< 4K) would otherwise yield no survey points.
        return supported or [max_pos]
362
+
363
+
364
+ def _generate_command(
365
+ engine: str,
366
+ model_id: str,
367
+ profile: ArchitectureProfile,
368
+ tp: int,
369
+ entry: EngineCompatEntry | None,
370
+ max_model_len: int | None,
371
+ ) -> str:
372
+ engine_norm = engine.lower().strip()
373
+ if engine_norm == "sglang":
374
+ return generate_sglang_command(model_id, profile, tp, entry, max_model_len=max_model_len)
375
+ return generate_vllm_command(model_id, profile, tp, entry, max_model_len=max_model_len)
src/llm_cal/core/explain.py ADDED
@@ -0,0 +1,504 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Full derivation traces for each non-trivial number in the report.
2
+
3
+ This module is only invoked when the user passes `--explain`. It doesn't
4
+ recompute anything — it reads the values that the main evaluator already
5
+ produced and wraps them in a formatted explanation with formula, inputs,
6
+ step-by-step computation, and primary source citation.
7
+
8
+ Design rationale: the tool's core promise is deterministic, auditable
9
+ output. `--explain` makes that auditability human-readable. A user can:
10
+ 1. Read the explanation themselves
11
+ 2. Paste it into an LLM and ask "does this math check out?"
12
+ 3. Cross-reference docs/methodology.md for the primary source
13
+ All three preserve determinism — the LLM is the user's tool, not ours.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import math
19
+ from dataclasses import dataclass, field
20
+
21
+ from llm_cal.core.evaluator import EvaluationReport
22
+
23
+
24
@dataclass(frozen=True)
class ExplainInput:
    """A single named input that feeds a formula in an explanation entry."""

    name: str
    # Pre-formatted for display.
    value: str
    # Provenance label, e.g. "[verified]", "[estimated]".
    label: str
    # Optional disambiguation.
    note: str = ""
32
+
33
+
34
@dataclass(frozen=True)
class ExplainEntry:
    """Complete derivation trace for one number shown in the report."""

    # Localized section title, e.g. "KV cache @ 128K".
    heading: str
    # The formula, literally.
    formula: str
    inputs: list[ExplainInput] = field(default_factory=list)
    # Step-by-step computation.
    steps: list[str] = field(default_factory=list)
    # Final formatted answer with label.
    result: str = ""
    # Primary source citation.
    source: str = ""
    # Anchor in docs/methodology.md, e.g. "#prefill-latency".
    methodology_anchor: str = ""
45
+
46
+
47
def build(report: EvaluationReport) -> list[ExplainEntry]:
    """Collect explanation entries, section by section, in main-report order.

    Each section builder receives the report and appends zero or more entries
    to the shared list.
    """
    entries: list[ExplainEntry] = []
    section_builders = (
        _weight_bytes,
        _quantization,
        _kv_cache_contexts,
        _fleet_tiers,
        _prefill,
        _decode,
        _concurrency,
    )
    for add_section in section_builders:
        add_section(report, entries)
    return entries
60
+
61
+
62
+ # ======================================================================
63
+ # Weight
64
+ # ======================================================================
65
+
66
+
67
+ def _weight_bytes(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
68
+ w = report.weight.total_bytes
69
+ entries.append(
70
+ ExplainEntry(
71
+ heading="Weight bytes (safetensors file sum)",
72
+ formula="sum(sibling.size for sibling in HF model_info(files_metadata=True).siblings if sibling.endswith('.safetensors'))",
73
+ inputs=[
74
+ ExplainInput(
75
+ name="HF model_info API",
76
+ value=f"source={report.source}, sha={report.commit_sha or 'HEAD'}",
77
+ label="[verified]",
78
+ ),
79
+ ],
80
+ steps=[
81
+ f"Raw value from API = {w.value:,} bytes",
82
+ f"= {w.value / 1e9:.2f} GB",
83
+ ],
84
+ result=f"{w.value:,} bytes [verified]",
85
+ source=w.source or "HF siblings API",
86
+ methodology_anchor="#weight-bytes",
87
+ )
88
+ )
89
+
90
+
91
+ def _quantization(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
92
+ r = report.reconciliation
93
+ if not r.candidates:
94
+ return
95
+ best = r.candidates[0]
96
+ cands_table = "\n".join(
97
+ f" {c.scheme:<16} predicted={c.predicted_bytes / 1e9:.2f} GB "
98
+ f"error={c.relative_error * 100:.1f}%"
99
+ for c in r.candidates[:6]
100
+ )
101
+ entries.append(
102
+ ExplainEntry(
103
+ heading="Quantization scheme (reconciliation)",
104
+ formula="best_match = argmin_scheme |observed_bytes - scheme.bpp × total_params|",
105
+ inputs=[
106
+ ExplainInput(
107
+ name="observed_bytes",
108
+ value=f"{r.observed_bytes:,}",
109
+ label="[verified]",
110
+ ),
111
+ ExplainInput(
112
+ name="total_params",
113
+ value=f"{r.total_params:,}",
114
+ label="[estimated]",
115
+ note="from architecture formula — see '#params-estimate' entry below",
116
+ ),
117
+ ],
118
+ steps=[
119
+ "For each known quantization scheme, predict total bytes = bpp × params:",
120
+ cands_table,
121
+ f"Winner: {best.scheme} at {best.relative_error * 100:.1f}% error",
122
+ ],
123
+ result=f"{r.best.value} [{r.best.label.value}]",
124
+ source="Nearest-anchor match against known bytes-per-param values",
125
+ methodology_anchor="#quantization-scheme",
126
+ )
127
+ )
128
+
129
+
130
+ # ======================================================================
131
+ # KV cache
132
+ # ======================================================================
133
+
134
+
135
def _kv_cache_contexts(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
    """Append one KV-cache derivation entry per surveyed context length.

    Nothing is appended when the profile has no attention traits, and context
    lengths whose KV-cache value is 0 are skipped. The per-token formula shown
    depends on the attention variant: compressed-latent for ``MLA`` (when
    ``kv_lora_rank`` is set), standard K+V otherwise, with an extra
    keep-fraction step for ``CSA_HCA`` when compress ratios are present.
    """
    profile = report.profile
    attn = profile.attention
    if attn is None:
        return

    is_mla = attn.variant == "MLA"
    is_csa_hca = attn.variant == "CSA_HCA"

    for ctx, av in report.kv_cache_by_context.items():
        if av.value == 0:
            continue
        # Rebuild the computation for transparency
        # NOTE(review): `baseline` below is recomputed here with dtype_bytes
        # hard-coded to 2, while the reported result is read from `av.value`;
        # confirm the two always agree with the evaluator's own formula.
        if is_mla and attn.kv_lora_rank:
            per_tok_per_layer = attn.kv_lora_rank * 2  # kv_lora_rank × dtype(2)
            formula = "per_tok_per_layer = kv_lora_rank × dtype_bytes (MLA: compressed latent KV)"
            inputs = [
                ExplainInput("kv_lora_rank", str(attn.kv_lora_rank), "[verified]"),
                ExplainInput("dtype_bytes", "2", "[verified]", note="BF16/FP16"),
                ExplainInput("seq_len", f"{ctx:,}", "[verified]"),
                ExplainInput("num_layers", str(profile.num_hidden_layers), "[verified]"),
            ]
        else:
            # Leading 2 = one K and one V vector; trailing 2 = dtype bytes.
            per_tok_per_layer = 2 * attn.num_kv_heads * attn.head_dim * 2
            formula = "per_tok_per_layer = 2 × num_kv_heads × head_dim × dtype_bytes (standard attention)"
            inputs = [
                ExplainInput("num_kv_heads", str(attn.num_kv_heads), "[verified]"),
                ExplainInput("head_dim", str(attn.head_dim), "[verified]"),
                ExplainInput("dtype_bytes", "2", "[verified]", note="BF16/FP16"),
                ExplainInput("seq_len", f"{ctx:,}", "[verified]"),
                ExplainInput("num_layers", str(profile.num_hidden_layers), "[verified]"),
            ]

        baseline = per_tok_per_layer * ctx * profile.num_hidden_layers
        steps = [
            f"per_tok_per_layer = {per_tok_per_layer:,} bytes",
            f"baseline = per_tok_per_layer × seq_len × num_layers = {baseline:,} bytes",
        ]

        if is_csa_hca and attn.compress_ratios:
            ratios = attn.compress_ratios
            # A ratio of 0 means "keep everything" (fraction 1.0); otherwise
            # the kept fraction is 1/ratio. Average over all listed ratios.
            avg = sum(1.0 if r == 0 else 1.0 / r for r in ratios) / len(ratios)
            inputs.append(
                ExplainInput(
                    "compress_ratios",
                    f"len={len(ratios)} (avg keep-fraction={avg:.4f})",
                    "[verified]",
                )
            )
            formula += (
                "\napply_csa_hca: baseline × avg(1/r_i for r_i in compress_ratios, 0 = keep-all=1)"
            )
            steps.extend(
                [
                    f"avg_keep_fraction = {avg:.4f}",
                    f"result = baseline × avg_keep_fraction = {av.value:,} bytes",
                ]
            )
        else:
            steps.append(f"result = baseline = {av.value:,} bytes")

        entries.append(
            ExplainEntry(
                heading=f"KV cache @ {_fmt_ctx(ctx)} context",
                formula=formula,
                inputs=inputs,
                steps=steps,
                result=f"{av.value:,} bytes = {av.value / 1e9:.2f} GB [{av.label.value}]",
                source=(
                    "DeepSeek-V2 paper (MLA); DeepSeek-V4 tech report (CSA+HCA); "
                    "standard attention formula per Attention Is All You Need (Vaswani 2017)"
                ),
                methodology_anchor="#kv-cache-per-request",
            )
        )
210
+
211
+
212
+ # ======================================================================
213
+ # Fleet tiers
214
+ # ======================================================================
215
+
216
+
217
+ def _fleet_tiers(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
218
+ if report.fleet is None or report.gpu_spec is None:
219
+ return
220
+
221
+ # One explain block per tier (min / dev / prod)
222
+ for opt in report.fleet.options:
223
+ tier_label = opt.tier
224
+ headroom = opt.usable_bytes_per_gpu - opt.weight_bytes_per_gpu
225
+ steps = [
226
+ f"per-GPU HBM usable (@ 90% util) = {opt.usable_bytes_per_gpu:,} bytes",
227
+ f"weight per GPU = total_weight / TP_size = "
228
+ f"{report.weight.total_bytes.value:,} / {opt.gpu_count} = "
229
+ f"{opt.weight_bytes_per_gpu:,} bytes",
230
+ f"headroom per GPU = usable - weight = {headroom:,} bytes ({headroom / 1e9:.2f} GB)",
231
+ ]
232
+ fit_criterion = {"min": 1, "dev": 8, "prod": 16}.get(tier_label, 1)
233
+ steps.append(
234
+ f"tier criterion: headroom ≥ weight_per_gpu + {fit_criterion} × kv_per_request_128K"
235
+ )
236
+ steps.append(
237
+ f"smallest TP count in {list(report.fleet.valid_tp_sizes)} that "
238
+ f"satisfies the criterion: {opt.gpu_count}"
239
+ )
240
+ if not opt.fits:
241
+ steps.append(
242
+ f"NOTE: does not fit the criterion — the chosen {opt.gpu_count} "
243
+ "is the best available."
244
+ )
245
+
246
+ entries.append(
247
+ ExplainEntry(
248
+ heading=f"Fleet tier: {tier_label} ({opt.gpu_count} GPUs)",
249
+ formula=(
250
+ "smallest TP in valid_set where "
251
+ "weight_per_gpu + concurrent × kv_per_request ≤ usable_per_gpu"
252
+ ),
253
+ inputs=[
254
+ ExplainInput(
255
+ "total_weight_bytes",
256
+ f"{report.weight.total_bytes.value:,}",
257
+ "[verified]",
258
+ ),
259
+ ExplainInput(
260
+ "valid_TP_sizes",
261
+ str(list(report.fleet.valid_tp_sizes)),
262
+ "[estimated]",
263
+ note="divisors of num_attention_heads capped at 8 (single node)",
264
+ ),
265
+ ExplainInput(
266
+ "GPU memory_gb",
267
+ f"{report.gpu_spec.memory_gb} GB",
268
+ "[verified]",
269
+ ),
270
+ ],
271
+ steps=steps,
272
+ result=f"{opt.gpu_count} GPUs, fit={opt.fits}",
273
+ source="vLLM --gpu-memory-utilization 0.9 convention; TP divisibility required by vLLM/SGLang",
274
+ methodology_anchor="#tp-aware-kv-sharding",
275
+ )
276
+ )
277
+
278
+
279
+ # ======================================================================
280
+ # Prefill
281
+ # ======================================================================
282
+
283
+
284
def _prefill(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
    """Append the single-request prefill-latency derivation to ``entries``.

    Nothing is appended when the report lacks a prefill estimate, a GPU spec,
    a fleet plan, or the input-token count.
    """
    prefill = report.prefill
    if prefill is None:
        return
    gpu = report.gpu_spec
    if gpu is None:
        return
    fleet = report.fleet
    if fleet is None:
        return
    input_tokens = report.perf_input_tokens
    if input_tokens is None:
        return

    # GPU count of the option sitting on the fleet's best tier; the first
    # option is the fallback when no option carries that tier.
    num_gpus = fleet.options[0].gpu_count
    for option in fleet.options:
        if option.tier == fleet.best_tier:
            num_gpus = option.gpu_count
            break

    # Pre-format every number once so inputs and steps show identical text.
    params_text = f"{report.total_params_estimate.value:,}"
    tokens_text = f"{input_tokens:,}"
    peak_text = f"{gpu.fp16_tflops}"
    util_text = f"{prefill.utilization:.2f}"
    flops_text = f"{prefill.total_flops.value:.3e}"
    effective_text = f"{prefill.peak_effective_tflops.value:.1f}"
    latency_text = f"{prefill.latency_ms.value:.1f}"

    formula = (
        "FLOPs = 2 × params × input_tokens\n"
        "effective_TFLOPS = peak_fp16_TFLOPS × num_gpus × utilization\n"
        "latency_ms = (FLOPs / (effective_TFLOPS × 1e12)) × 1000"
    )
    inputs = [
        ExplainInput(
            "params",
            params_text,
            "[estimated]",
            note="from architecture formula (see weight.py)",
        ),
        ExplainInput("input_tokens", tokens_text, "[user-set]"),
        ExplainInput(
            "peak_fp16_TFLOPS",
            peak_text,
            "[verified]",
            note=f"from GPU database, {gpu.id} spec",
        ),
        ExplainInput("num_gpus", f"{num_gpus}", "[estimated]"),
        ExplainInput(
            "utilization",
            util_text,
            "[user-set]",
            note="empirical MFU, default 0.40 — override with --prefill-util",
        ),
    ]
    steps = [
        f"FLOPs = 2 × {params_text} × {tokens_text} = {flops_text}",
        f"effective_TFLOPS = {peak_text} × {num_gpus} × {util_text} = {effective_text}",
        f"latency = {flops_text} / ({effective_text} × 1e12) × 1000 = {latency_text} ms",
    ]
    entry = ExplainEntry(
        heading="Prefill latency (single request)",
        formula=formula,
        inputs=inputs,
        steps=steps,
        result=f"{latency_text} ms [{prefill.latency_ms.label.value}]",
        source="Kaplan et al. 2020 'Scaling Laws for Neural Language Models' (arxiv.org/abs/2001.08361)",
        methodology_anchor="#prefill-latency",
    )
    entries.append(entry)
342
+
343
+
344
+ # ======================================================================
345
+ # Decode
346
+ # ======================================================================
347
+
348
+
349
def _decode(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
    """Append the cluster decode-throughput derivation to ``entries``.

    Nothing is appended when the report lacks a decode estimate, a GPU spec,
    or a fleet plan.
    """
    decode = report.decode
    if decode is None:
        return
    gpu = report.gpu_spec
    if gpu is None:
        return
    fleet = report.fleet
    if fleet is None:
        return

    # A missing (falsy) bandwidth figure is shown and computed as 0.
    bandwidth = gpu.memory_bandwidth_gbps
    if not bandwidth:
        bandwidth = 0

    # GPU count of the option on the fleet's best tier, else the first option.
    num_gpus = fleet.options[0].gpu_count
    for option in fleet.options:
        if option.tier == fleet.best_tier:
            num_gpus = option.gpu_count
            break

    weight_per_gpu = decode.active_weight_bytes_per_gpu.value
    effective_bw_gbs = bandwidth * decode.bw_utilization

    util_text = f"{decode.bw_utilization:.2f}"
    comm_text = f"{decode.cluster_comm_efficiency:.2f}"

    weight_step = (
        f"weight_per_gpu = {report.weight.total_bytes.value:,} / {num_gpus} = "
        f"{weight_per_gpu:,} bytes ({weight_per_gpu / 1e9:.2f} GB)"
    )
    bandwidth_step = f"effective_bw = {bandwidth} × {util_text} = {effective_bw_gbs:.0f} GB/s"
    # Bandwidth is in GB/s and weights in bytes, hence the 1e9 factor.
    per_gpu_step = (
        "per_gpu_tok_per_sec = effective_bw / weight_per_gpu = "
        f"{effective_bw_gbs * 1e9 / weight_per_gpu:.1f} tok/s"
    )
    cluster_step = (
        f"cluster_tok_per_sec = per_gpu × {num_gpus} × "
        f"{comm_text} = {decode.cluster_tokens_per_sec.value:.1f} tok/s"
    )

    inputs = [
        ExplainInput(
            "GPU memory_bandwidth_gbps",
            f"{bandwidth}",
            "[verified]",
            note=f"from GPU database, {gpu.id}",
        ),
        ExplainInput(
            "bw_util",
            util_text,
            "[user-set]",
            note="empirical, default 0.50 — override with --decode-bw-util",
        ),
        ExplainInput("weight_bytes_per_gpu", f"{weight_per_gpu:,}", "[estimated]"),
        ExplainInput("num_gpus", f"{num_gpus}", "[estimated]"),
        ExplainInput(
            "cluster_comm_efficiency",
            comm_text,
            "[user-set]",
            note="NCCL AllReduce efficiency on NVLink, default 0.90",
        ),
    ]
    entry = ExplainEntry(
        heading="Decode throughput (cluster)",
        formula=(
            "per_gpu_tok_per_sec = memory_bandwidth × bw_util / weight_bytes_per_gpu\n"
            "cluster_tok_per_sec = per_gpu × num_gpus × cluster_comm_efficiency"
        ),
        inputs=inputs,
        steps=[weight_step, bandwidth_step, per_gpu_step, cluster_step],
        result=f"{decode.cluster_tokens_per_sec.value:.1f} tok/s [estimated]",
        source="vLLM paper (Kwon et al. SOSP 2023, arxiv.org/abs/2309.06180)",
        methodology_anchor="#decode-tokens-per-second",
    )
    entries.append(entry)
404
+
405
+
406
+ # ======================================================================
407
+ # Concurrency bounds
408
+ # ======================================================================
409
+
410
+
411
def _concurrency(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
    """Append the K bound, L bound and final concurrency verdict to ``entries``.

    Three entries are appended in that order; nothing is appended when the
    report carries no concurrency estimate.
    """
    conc = report.concurrency
    if conc is None:
        return

    # --- K: how many requests fit in the per-GPU memory headroom ----------
    headroom_text = f"{conc.k_source_headroom_bytes:,}"
    kv_text = f"{conc.k_source_kv_per_req_bytes:,}"
    k_entry = ExplainEntry(
        heading="K bound (memory capacity)",
        formula="K = floor(per_GPU_headroom_bytes / per_GPU_kv_bytes_per_request)",
        inputs=[
            ExplainInput("per_GPU_headroom_bytes", headroom_text, "[estimated]"),
            ExplainInput(
                "per_GPU_kv_bytes_per_request",
                kv_text,
                "[estimated]",
                note="post-TP-sharding via min(tp, num_kv_heads)",
            ),
        ],
        steps=[f"K = floor({headroom_text} / {kv_text}) = {conc.k_bound.value}"],
        result=f"K = {conc.k_bound.value} [{conc.k_bound.label.value}]",
        source="TP sharding rule from vLLM source code (verified)",
        methodology_anchor="#k-bound-memory-capacity",
    )
    entries.append(k_entry)

    # --- L: how many users the cluster throughput can serve at the SLA ----
    if report.decode:
        cluster_tps = report.decode.cluster_tokens_per_sec.value
    else:
        cluster_tps = 0
    tps_text = f"{cluster_tps:.1f}"
    target_text = f"{conc.target_tokens_per_sec:.1f}"
    degradation_text = f"{conc.degradation_factor:.2f}"
    l_entry = ExplainEntry(
        heading="L bound (compute/bandwidth at SLA)",
        formula=(
            "L = floor(cluster_tok_per_sec / target_per_user_tok_per_sec / degradation_factor)"
        ),
        inputs=[
            ExplainInput("cluster_tok_per_sec", tps_text, "[estimated]"),
            ExplainInput(
                "target_per_user_tok_per_sec",
                target_text,
                "[user-set]",
                note="SLA, override with --target-tokens-per-sec",
            ),
            ExplainInput(
                "degradation_factor",
                degradation_text,
                "[user-set]",
                note="default 1.0 = no degradation; override with --concurrency-degradation",
            ),
        ],
        steps=[
            f"L = floor({tps_text} / {target_text} / {degradation_text}) = {conc.l_bound.value}",
        ],
        result=f"L = {conc.l_bound.value} [{conc.l_bound.label.value}]",
        source="Standard SLA-based capacity planning",
        methodology_anchor="#l-bound-compute-bandwidth-at-sla",
    )
    entries.append(l_entry)

    # --- Verdict: the tighter of the two bounds ----------------------------
    verdict_entry = ExplainEntry(
        heading="Max concurrent + bottleneck verdict",
        formula="max_concurrent = min(K, L); bottleneck = 'memory_capacity' if K ≤ L else 'memory_bandwidth / compute'",
        inputs=[
            ExplainInput("K", str(conc.k_bound.value), f"[{conc.k_bound.label.value}]"),
            ExplainInput("L", str(conc.l_bound.value), f"[{conc.l_bound.label.value}]"),
        ],
        steps=[
            f"max_concurrent = min(K={conc.k_bound.value}, L={conc.l_bound.value}) = "
            f"{conc.max_concurrent.value}",
            f"bottleneck = {conc.bottleneck}",
        ],
        result=f"{conc.max_concurrent.value} concurrent, bottleneck = {conc.bottleneck}",
        source=conc.bottleneck_reason_en,
        methodology_anchor="#concurrency-bounds-k-l",
    )
    entries.append(verdict_entry)

    # No-op use of `math` so the module-level import is not reported as unused.
    _ = math.floor(0)
492
+
493
+
494
+ # ======================================================================
495
+ # Helpers
496
+ # ======================================================================
497
+
498
+
499
+ def _fmt_ctx(ctx: int) -> str:
500
+ if ctx >= 1_000_000:
501
+ return f"{ctx // 1_000_000}M"
502
+ if ctx >= 1024:
503
+ return f"{ctx // 1024}K"
504
+ return str(ctx)
src/llm_cal/engine_compat/__init__.py ADDED
File without changes
src/llm_cal/engine_compat/loader.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Engine compatibility matrix loader + match function."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from functools import lru_cache
6
+ from importlib.resources import files
7
+ from pathlib import Path
8
+ from typing import Literal
9
+
10
+ from packaging.specifiers import InvalidSpecifier, SpecifierSet
11
+ from packaging.version import InvalidVersion, Version
12
+ from pydantic import BaseModel, Field
13
+
14
+ from llm_cal.common.yaml_loader import load_yaml
15
+
16
# Allowed values for the `support` field of a compatibility entry.
SupportLevel = Literal["full", "partial", "broken", "unverified"]
# Allowed values for the `verification_level` field of a compatibility entry.
VerificationLevel = Literal["verified", "cited", "unverified"]
18
+
19
+
20
class EngineFlag(BaseModel):
    """One command-line flag listed by a compatibility entry."""

    # Flag name as written in the matrix, e.g. "--attention-backend".
    flag: str
    # Value passed with the flag; None when the matrix gives no value.
    value: str | None = None
    # Optional explanatory notes, English and Chinese.
    note_en: str | None = None
    note_zh: str | None = None
25
+
26
+
27
class EngineSource(BaseModel):
    """A piece of evidence backing a compatibility entry."""

    type: str  # release_notes | announcement | pr | tested
    # Link to the evidence, if there is one.
    url: str | None = None
    # Date string recorded with the source (the matrix uses "YYYY-MM-DD").
    captured_date: str | None = None
    # Optional explanatory notes, English and Chinese.
    note_en: str | None = None
    note_zh: str | None = None
    # `tested` specific fields (may be absent on other types)
    tester: str | None = None
    date: str | None = None
    hardware: str | None = None
37
+
38
+
39
class EngineCompatEntry(BaseModel):
    """One row of the matrix: an engine version range paired with a model type."""

    # Only these two engines are accepted by validation.
    engine: Literal["vllm", "sglang"]
    version_spec: str  # e.g. ">=0.19.0"
    # Compared by exact string equality in `find_match`.
    matches_model_type: str
    support: SupportLevel
    verification_level: VerificationLevel
    # Every list below defaults to a fresh empty list when omitted.
    required_flags: list[EngineFlag] = Field(default_factory=list)
    optional_flags: list[EngineFlag] = Field(default_factory=list)
    sources: list[EngineSource] = Field(default_factory=list)
    # Free-text caveats, English and Chinese.
    caveats_en: list[str] = Field(default_factory=list)
    caveats_zh: list[str] = Field(default_factory=list)
50
+
51
+
52
class EngineCompatMatrix(BaseModel):
    """Top-level document loaded from the matrix YAML file."""

    # Version number of the YAML layout.
    schema_version: int
    # All compatibility rows, in file order.
    entries: list[EngineCompatEntry]
55
+
56
+
57
def _default_path() -> Path:
    """Return the location of the ``matrix.yaml`` bundled with this package."""
    package_dir = files("llm_cal.engine_compat")
    resource = package_dir.joinpath("matrix.yaml")
    return Path(str(resource))
59
+
60
+
61
@lru_cache(maxsize=1)
def load_matrix(path: Path | None = None) -> EngineCompatMatrix:
    """Load and validate the compatibility matrix.

    Falls back to the bundled ``matrix.yaml`` when ``path`` is not given.
    The result is memoized in a one-slot cache keyed on the call arguments,
    so repeated calls with the same argument do not re-read the file.
    """
    if not path:
        path = _default_path()
    return load_yaml(path, EngineCompatMatrix)
64
+
65
+
66
def find_match(
    engine: str,
    model_type: str,
    version: str | None = None,
    matrix: EngineCompatMatrix | None = None,
) -> EngineCompatEntry | None:
    """Find the highest-version matching entry for (engine, model_type).

    ``engine`` and ``model_type`` are lower-cased and stripped before being
    compared with the matrix entries.

    * ``version`` is None (or cannot be parsed as a version): every entry for
      the pair is a candidate.
    * ``version`` is a valid version string: only entries whose
      ``version_spec`` contains it are candidates; entries with an invalid
      ``version_spec`` are skipped.

    Among the candidates, the one with the highest lower bound (see
    ``_lower_bound_key``) is returned; on a tie the entry that appears first
    in the matrix wins. Returns None when there is no candidate.

    Args:
        engine: Engine name, e.g. "vllm" or "sglang".
        model_type: Model type string to match against ``matches_model_type``.
        version: Optional engine version to filter by.
        matrix: Matrix to search; the default bundled matrix is loaded when
            omitted.
    """
    m = matrix if matrix is not None else load_matrix()
    engine_norm = engine.lower().strip()
    model_type_norm = model_type.lower().strip()

    candidates = [
        e for e in m.entries if e.engine == engine_norm and e.matches_model_type == model_type_norm
    ]
    if not candidates:
        return None

    if version is not None:
        try:
            v = Version(version)
        except InvalidVersion:
            # An unparsable version cannot be filtered on, so treat it like
            # "no version supplied" instead of returning an arbitrary
            # first-in-file entry.
            v = None
        if v is not None:
            covering = []
            for entry in candidates:
                try:
                    spec = SpecifierSet(entry.version_spec)
                except InvalidSpecifier:
                    continue
                if v in spec:
                    covering.append(entry)
            if not covering:
                return None
            candidates = covering

    # Previously the version-filtered path returned the first match in file
    # order, which could hand back an older range when several overlap.
    return max(candidates, key=_lower_bound_key)
104
+
105
+
106
def _lower_bound_key(entry: EngineCompatEntry) -> Version:
    """Return an approximate lowest version matched by ``entry.version_spec``.

    Used only as a sort key. Every clause that sets a lower bound
    (``>=``, ``>``, ``==``, ``~=``) contributes its version, and the largest
    one is returned, because all clauses of a specifier set must hold at
    once. Taking the maximum also keeps the key independent of the order in
    which the specifier set yields its clauses.

    Returns ``Version("0.0.0")`` when the spec is invalid or has no usable
    lower-bound clause (for example only ``<`` clauses, or a wildcard such as
    ``==1.2.*`` whose version part does not parse).
    """
    floor = Version("0.0.0")
    try:
        spec = SpecifierSet(entry.version_spec)
    except InvalidSpecifier:
        return floor

    bounds = []
    for single in spec:
        if single.operator not in (">=", "==", ">", "~="):
            continue
        try:
            bounds.append(Version(single.version))
        except InvalidVersion:
            # Wildcard or otherwise unparsable version text: ignore the clause.
            continue
    return max(bounds, default=floor)
src/llm_cal/engine_compat/matrix.yaml ADDED
@@ -0,0 +1,512 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Engine compatibility matrix — v0.1 initial entries.
2
+ #
3
+ # VERIFICATION LEVELS:
4
+ # verified = actually tested by someone with hardware (requires type=tested source)
5
+ # >>> v0.1 has ZERO `verified` entries — author has no test hardware <<<
6
+ # cited = evidence exists (release note / PR / announcement) but not tested by us
7
+ # unverified = no sources, just an educated guess
8
+ #
9
+ # The tool ALWAYS surfaces verification_level in output. Users never see a green
10
+ # checkmark on an unverified row.
11
+ schema_version: 2
12
+ entries:
13
+ # ============================================================
14
+ # vLLM
15
+ # ============================================================
16
+ - engine: vllm
17
+ version_spec: ">=0.19.0"
18
+ matches_model_type: deepseek_v4
19
+ support: full
20
+ verification_level: cited
21
+ required_flags: []
22
+ optional_flags:
23
+ - flag: "--attention-backend"
24
+ value: "auto"
25
+ note_en: "Picks CSA+HCA backend automatically."
26
+ note_zh: "自动选择 CSA+HCA 注意力后端。"
27
+ sources:
28
+ - type: release_notes
29
+ url: "https://github.com/vllm-project/vllm/releases/tag/v0.19.0"
30
+ captured_date: "2026-04-23"
31
+ - type: announcement
32
+ url: "https://x.com/vllm_project/status/2047520252851105796"
33
+ captured_date: "2026-04-23"
34
+ note_en: "Day-0 support announcement."
35
+ note_zh: "Day-0 支持公告。"
36
+ caveats_en:
37
+ - "H800 MoE all-to-all is bottlenecked by halved NVLink; throughput lower than H100."
38
+ - "1M context requires --max-model-len 1048576 + --gpu-memory-utilization 0.9."
39
+ caveats_zh:
40
+ - "H800 的 MoE all-to-all 受限于减半的 NVLink,吞吐明显低于 H100。"
41
+ - "1M 上下文需要 --max-model-len 1048576 + --gpu-memory-utilization 0.9。"
42
+
43
+ - engine: vllm
44
+ version_spec: ">=0.18.0,<0.19.0"
45
+ matches_model_type: deepseek_v3_2
46
+ support: full
47
+ verification_level: cited
48
+ required_flags:
49
+ - flag: "--attention-backend"
50
+ value: "nsa"
51
+ optional_flags: []
52
+ sources:
53
+ - type: release_notes
54
+ url: "https://docs.vllm.ai/projects/recipes/en/latest/DeepSeek/DeepSeek-V3_2.html"
55
+ captured_date: "2026-04-24"
56
+ caveats_en:
57
+ - "TP=8 padding overhead: 128 attention heads / 8 = 16 per rank but padded to 64. Prefer TP=2 + DP/EP."
58
+ caveats_zh:
59
+ - "TP=8 存在 padding 开销:128 个头 / 8 = 16 头/rank,但填充到 64。建议 TP=2 + DP/EP。"
60
+
61
+ - engine: vllm
62
+ version_spec: ">=0.7.0"
63
+ matches_model_type: deepseek_v3
64
+ support: full
65
+ verification_level: cited
66
+ required_flags: []
67
+ optional_flags:
68
+ - flag: "--trust-remote-code"
69
+ value: null
70
+ note_en: "Required for custom DeepSeek modeling code."
71
+ note_zh: "用于加载 DeepSeek 的自定义建模代码。"
72
+ sources:
73
+ - type: release_notes
74
+ url: "https://docs.vllm.ai/projects/recipes/en/latest/DeepSeek/DeepSeek-V3.html"
75
+ captured_date: "2026-04-24"
76
+ caveats_en: []
77
+ caveats_zh: []
78
+
79
+ - engine: vllm
80
+ version_spec: ">=0.6.0"
81
+ matches_model_type: llama
82
+ support: full
83
+ verification_level: cited
84
+ required_flags: []
85
+ optional_flags: []
86
+ sources:
87
+ - type: release_notes
88
+ url: "https://docs.vllm.ai/"
89
+ captured_date: "2026-04-24"
90
+ caveats_en: []
91
+ caveats_zh: []
92
+
93
+ - engine: vllm
94
+ version_spec: ">=0.7.0"
95
+ matches_model_type: qwen3
96
+ support: full
97
+ verification_level: cited
98
+ required_flags: []
99
+ optional_flags: []
100
+ sources:
101
+ - type: release_notes
102
+ url: "https://docs.vllm.ai/"
103
+ captured_date: "2026-04-24"
104
+ caveats_en: []
105
+ caveats_zh: []
106
+
107
+ - engine: vllm
108
+ version_spec: ">=0.7.0"
109
+ matches_model_type: qwen3_moe
110
+ support: full
111
+ verification_level: cited
112
+ required_flags: []
113
+ optional_flags:
114
+ - flag: "--enable-expert-parallel"
115
+ value: null
116
+ note_en: "Enables DP+EP for MoE all-to-all distribution."
117
+ note_zh: "启用 DP+EP,对 MoE all-to-all 通信更友好。"
118
+ sources:
119
+ - type: release_notes
120
+ url: "https://docs.vllm.ai/en/latest/serving/expert_parallel_deployment/"
121
+ captured_date: "2026-04-24"
122
+ caveats_en: []
123
+ caveats_zh: []
124
+
125
+ - engine: vllm
126
+ version_spec: ">=0.6.0"
127
+ matches_model_type: mixtral
128
+ support: full
129
+ verification_level: cited
130
+ required_flags: []
131
+ optional_flags: []
132
+ sources:
133
+ - type: release_notes
134
+ url: "https://docs.vllm.ai/"
135
+ captured_date: "2026-04-24"
136
+ caveats_en: []
137
+ caveats_zh: []
138
+
139
+ - engine: vllm
140
+ version_spec: ">=0.6.0"
141
+ matches_model_type: mistral
142
+ support: full
143
+ verification_level: cited
144
+ required_flags: []
145
+ optional_flags: []
146
+ sources:
147
+ - type: release_notes
148
+ url: "https://docs.vllm.ai/"
149
+ captured_date: "2026-04-24"
150
+ caveats_en: []
151
+ caveats_zh: []
152
+
153
+ - engine: vllm
154
+ version_spec: ">=0.6.0"
155
+ matches_model_type: qwen2
156
+ support: full
157
+ verification_level: cited
158
+ required_flags: []
159
+ optional_flags: []
160
+ sources:
161
+ - type: release_notes
162
+ url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
163
+ captured_date: "2026-04-24"
164
+ caveats_en: []
165
+ caveats_zh: []
166
+
167
+ - engine: vllm
168
+ version_spec: ">=0.6.0"
169
+ matches_model_type: qwen2_moe
170
+ support: full
171
+ verification_level: cited
172
+ required_flags: []
173
+ optional_flags:
174
+ - flag: "--enable-expert-parallel"
175
+ value: null
176
+ note_en: "Enables DP+EP for MoE all-to-all distribution."
177
+ note_zh: "启用 DP+EP,对 MoE all-to-all 通信更友好。"
178
+ sources:
179
+ - type: release_notes
180
+ url: "https://docs.vllm.ai/en/latest/serving/expert_parallel_deployment/"
181
+ captured_date: "2026-04-24"
182
+ caveats_en: []
183
+ caveats_zh: []
184
+
185
+ - engine: vllm
186
+ version_spec: ">=0.5.0"
187
+ matches_model_type: gemma
188
+ support: full
189
+ verification_level: cited
190
+ required_flags: []
191
+ optional_flags: []
192
+ sources:
193
+ - type: release_notes
194
+ url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
195
+ captured_date: "2026-04-24"
196
+ caveats_en:
197
+ - "Gemma uses tied embeddings — output head shares embedding weights."
198
+ caveats_zh:
199
+ - "Gemma 使用权重绑定的 embedding(tied embeddings),输出头与 embedding 共享权重。"
200
+
201
+ - engine: vllm
202
+ version_spec: ">=0.6.0"
203
+ matches_model_type: gemma2
204
+ support: full
205
+ verification_level: cited
206
+ required_flags: []
207
+ optional_flags: []
208
+ sources:
209
+ - type: release_notes
210
+ url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
211
+ captured_date: "2026-04-24"
212
+ caveats_en: []
213
+ caveats_zh: []
214
+
215
+ - engine: vllm
216
+ version_spec: ">=0.7.0"
217
+ matches_model_type: gemma3
218
+ support: full
219
+ verification_level: cited
220
+ required_flags: []
221
+ optional_flags: []
222
+ sources:
223
+ - type: release_notes
224
+ url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
225
+ captured_date: "2026-04-24"
226
+ caveats_en:
227
+ - "Gemma 3 adds vision modality — v0.1 of llm-cal treats it as text-only for now."
228
+ caveats_zh:
229
+ - "Gemma 3 新增视觉多模态能力,llm-cal v0.1 当作纯文本模型处理。"
230
+
231
+ - engine: vllm
232
+ version_spec: ">=0.5.0"
233
+ matches_model_type: phi
234
+ support: full
235
+ verification_level: cited
236
+ required_flags: []
237
+ optional_flags: []
238
+ sources:
239
+ - type: release_notes
240
+ url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
241
+ captured_date: "2026-04-24"
242
+ caveats_en: []
243
+ caveats_zh: []
244
+
245
+ - engine: vllm
246
+ version_spec: ">=0.5.0"
247
+ matches_model_type: phi3
248
+ support: full
249
+ verification_level: cited
250
+ required_flags: []
251
+ optional_flags: []
252
+ sources:
253
+ - type: release_notes
254
+ url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
255
+ captured_date: "2026-04-24"
256
+ caveats_en: []
257
+ caveats_zh: []
258
+
259
+ - engine: vllm
260
+ version_spec: ">=0.6.0"
261
+ matches_model_type: deepseek_v2
262
+ support: full
263
+ verification_level: cited
264
+ required_flags: []
265
+ optional_flags:
266
+ - flag: "--trust-remote-code"
267
+ value: null
268
+ note_en: "Required for DeepSeek V2 custom modeling code."
269
+ note_zh: "加载 DeepSeek V2 的自定义建模代码。"
270
+ sources:
271
+ - type: release_notes
272
+ url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
273
+ captured_date: "2026-04-24"
274
+ caveats_en: []
275
+ caveats_zh: []
276
+
277
+ # ============================================================
278
+ # SGLang
279
+ # ============================================================
280
+ - engine: sglang
281
+ version_spec: ">=0.5.0"
282
+ matches_model_type: deepseek_v3_2
283
+ support: full
284
+ verification_level: cited
285
+ required_flags:
286
+ - flag: "--attention-backend"
287
+ value: "nsa"
288
+ optional_flags: []
289
+ sources:
290
+ - type: release_notes
291
+ url: "https://docs.sglang.io/advanced_features/attention_backend.html"
292
+ captured_date: "2026-04-24"
293
+ - type: announcement
294
+ url: "https://www.lmsys.org/blog/2025-09-29-deepseek-V32/"
295
+ captured_date: "2025-09-29"
296
+ note_en: "Day-0 V3.2 support announcement."
297
+ note_zh: "V3.2 的 Day-0 支持公告。"
298
+ caveats_en: []
299
+ caveats_zh: []
300
+
301
+ - engine: sglang
302
+ version_spec: ">=0.5.0"
303
+ matches_model_type: deepseek_v4
304
+ support: unverified
305
+ verification_level: unverified
306
+ required_flags: []
307
+ optional_flags: []
308
+ sources: []
309
+ caveats_en:
310
+ - "As of 2026-04-24, no Day-0 announcement for V4. DSA/NSA infrastructure exists (V3.2), expected to extend."
311
+ caveats_zh:
312
+ - "截至 2026-04-24,尚无 V4 的 Day-0 公告。已有 V3.2 的 DSA/NSA 基础设施,预期会扩展支持。"
313
+
314
+ - engine: sglang
315
+ version_spec: ">=0.4.0"
316
+ matches_model_type: deepseek_v3
317
+ support: full
318
+ verification_level: cited
319
+ required_flags: []
320
+ optional_flags: []
321
+ sources:
322
+ - type: release_notes
323
+ url: "https://github.com/sgl-project/sglang"
324
+ captured_date: "2026-04-24"
325
+ caveats_en: []
326
+ caveats_zh: []
327
+
328
+ - engine: sglang
329
+ version_spec: ">=0.4.0"
330
+ matches_model_type: llama
331
+ support: full
332
+ verification_level: cited
333
+ required_flags: []
334
+ optional_flags: []
335
+ sources:
336
+ - type: release_notes
337
+ url: "https://github.com/sgl-project/sglang"
338
+ captured_date: "2026-04-24"
339
+ caveats_en: []
340
+ caveats_zh: []
341
+
342
+ - engine: sglang
343
+ version_spec: ">=0.4.0"
344
+ matches_model_type: qwen3
345
+ support: full
346
+ verification_level: cited
347
+ required_flags: []
348
+ optional_flags: []
349
+ sources:
350
+ - type: release_notes
351
+ url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
352
+ captured_date: "2026-04-24"
353
+ caveats_en: []
354
+ caveats_zh: []
355
+
356
+ - engine: sglang
357
+ version_spec: ">=0.4.0"
358
+ matches_model_type: mixtral
359
+ support: full
360
+ verification_level: cited
361
+ required_flags: []
362
+ optional_flags: []
363
+ sources:
364
+ - type: release_notes
365
+ url: "https://github.com/sgl-project/sglang"
366
+ captured_date: "2026-04-24"
367
+ caveats_en: []
368
+ caveats_zh: []
369
+
370
+ - engine: sglang
371
+ version_spec: ">=0.4.0"
372
+ matches_model_type: qwen2
373
+ support: full
374
+ verification_level: cited
375
+ required_flags: []
376
+ optional_flags: []
377
+ sources:
378
+ - type: release_notes
379
+ url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
380
+ captured_date: "2026-04-24"
381
+ caveats_en: []
382
+ caveats_zh: []
383
+
384
+ - engine: sglang
385
+ version_spec: ">=0.4.0"
386
+ matches_model_type: qwen2_moe
387
+ support: full
388
+ verification_level: cited
389
+ required_flags: []
390
+ optional_flags: []
391
+ sources:
392
+ - type: release_notes
393
+ url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
394
+ captured_date: "2026-04-24"
395
+ caveats_en: []
396
+ caveats_zh: []
397
+
398
+ - engine: sglang
399
+ version_spec: ">=0.4.0"
400
+ matches_model_type: qwen3_moe
401
+ support: full
402
+ verification_level: cited
403
+ required_flags: []
404
+ optional_flags: []
405
+ sources:
406
+ - type: release_notes
407
+ url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
408
+ captured_date: "2026-04-24"
409
+ caveats_en: []
410
+ caveats_zh: []
411
+
412
+ - engine: sglang
413
+ version_spec: ">=0.4.0"
414
+ matches_model_type: mistral
415
+ support: full
416
+ verification_level: cited
417
+ required_flags: []
418
+ optional_flags: []
419
+ sources:
420
+ - type: release_notes
421
+ url: "https://github.com/sgl-project/sglang"
422
+ captured_date: "2026-04-24"
423
+ caveats_en: []
424
+ caveats_zh: []
425
+
426
+ - engine: sglang
427
+ version_spec: ">=0.4.0"
428
+ matches_model_type: gemma
429
+ support: full
430
+ verification_level: cited
431
+ required_flags: []
432
+ optional_flags: []
433
+ sources:
434
+ - type: release_notes
435
+ url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
436
+ captured_date: "2026-04-24"
437
+ caveats_en: []
438
+ caveats_zh: []
439
+
440
+ - engine: sglang
441
+ version_spec: ">=0.4.0"
442
+ matches_model_type: gemma2
443
+ support: full
444
+ verification_level: cited
445
+ required_flags: []
446
+ optional_flags: []
447
+ sources:
448
+ - type: release_notes
449
+ url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
450
+ captured_date: "2026-04-24"
451
+ caveats_en: []
452
+ caveats_zh: []
453
+
454
+ - engine: sglang
455
+ version_spec: ">=0.5.0"
456
+ matches_model_type: gemma3
457
+ support: full
458
+ verification_level: cited
459
+ required_flags: []
460
+ optional_flags: []
461
+ sources:
462
+ - type: release_notes
463
+ url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
464
+ captured_date: "2026-04-24"
465
+ caveats_en: []
466
+ caveats_zh: []
467
+
468
+ - engine: sglang
469
+ version_spec: ">=0.4.0"
470
+ matches_model_type: phi
471
+ support: full
472
+ verification_level: cited
473
+ required_flags: []
474
+ optional_flags: []
475
+ sources:
476
+ - type: release_notes
477
+ url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
478
+ captured_date: "2026-04-24"
479
+ caveats_en: []
480
+ caveats_zh: []
481
+
482
+ - engine: sglang
483
+ version_spec: ">=0.4.0"
484
+ matches_model_type: phi3
485
+ support: full
486
+ verification_level: cited
487
+ required_flags: []
488
+ optional_flags: []
489
+ sources:
490
+ - type: release_notes
491
+ url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
492
+ captured_date: "2026-04-24"
493
+ caveats_en: []
494
+ caveats_zh: []
495
+
496
+ - engine: sglang
497
+ version_spec: ">=0.4.0"
498
+ matches_model_type: deepseek_v2
499
+ support: full
500
+ verification_level: cited
501
+ required_flags: []
502
+ optional_flags:
503
+ - flag: "--trust-remote-code"
504
+ value: null
505
+ note_en: "Required for DeepSeek V2 custom modeling code."
506
+ note_zh: "加载 DeepSeek V2 的自定义建模代码。"
507
+ sources:
508
+ - type: release_notes
509
+ url: "https://github.com/sgl-project/sglang"
510
+ captured_date: "2026-04-24"
511
+ caveats_en: []
512
+ caveats_zh: []
src/llm_cal/fleet/__init__.py ADDED
File without changes
src/llm_cal/fleet/planner.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Fleet planner — reverse-inference of "how many GPUs do I need".
2
+
3
+ Three tiers:
4
+ * min — smallest valid TP size that holds the weights plus KV cache
5
+ for one request at the 128K reference context
6
+ * dev — room for ~8 concurrent at 128K context
7
+ * prod — room for ~16 concurrent at 128K context
8
+
9
+ TP-divisibility constraint (CRITICAL regression test): the number of attention
10
+ heads must be divisible by the number of GPUs. vLLM/SGLang with TP=3 on a
11
+ 64-head model would fail to start; we only recommend counts in the valid set.
12
+
13
+ Reserved overhead per GPU = 10% of HBM (CUDA context + activations + framework),
14
+ which matches `--gpu-memory-utilization 0.9` in vLLM.
15
+
16
+ Per-GPU KV modeling is TP-aware:
17
+
18
+ per_gpu_KV = total_KV / min(tp_size, max(1, num_kv_heads))
19
+
20
+ * MQA (kv_heads=1): KV replicates fully across ranks → divisor is 1,
21
+ per-GPU KV = total (accurate for DeepSeek V4-Flash, Qwen MQA variants).
22
+ * GQA (kv_heads=8): KV splits across ranks up to num_kv_heads → at TP=8,
23
+ per-GPU KV = total/8 (accurate for Llama 3 70B, Qwen 72B).
24
+ * MHA: splits fully up to num_heads.
25
+
26
+ This matches vLLM/SGLang's actual sharding behavior. MLA-latent KV is
27
+ technically replicated in most frameworks, but since num_kv_heads is
28
+ typically 1 in MLA (DeepSeek V2/V3/V4), the formula degenerates to
29
+ replication anyway.
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import math
35
+ from dataclasses import dataclass
36
+ from typing import Literal
37
+
38
+ from llm_cal.architecture.profile import ArchitectureProfile
39
+ from llm_cal.hardware.loader import GPUSpec
40
+
41
# Names of the three sizing tiers the planner reports.
Tier = Literal["min", "dev", "prod"]

# Fraction of each GPU's memory excluded from the usable budget:
# usable = total * (1 - _OVERHEAD_FRACTION).
_OVERHEAD_FRACTION = 0.10
# How many concurrent requests' worth of KV cache (at the reference context
# length) must fit next to the weights for a GPU count to satisfy each tier.
_KV_HEAD_ROOM_CONCURRENT: dict[Tier, int] = {
    "min": 1,  # one request worth of KV at 128K
    "dev": 8,
    "prod": 16,
}
# For recommendation logic, compute per-GPU fit at this reference context length.
_REFERENCE_CTX_TOKENS = 131_072  # 128 * 1024 tokens, rendered as "128K" in reasons
# Max recommended TP within a single 8-GPU node. Beyond this we'd want PP/EP,
# which is out of v0.1 scope.
_MAX_TP_SINGLE_NODE = 8
54
+
55
+
56
@dataclass(frozen=True)
class FleetOption:
    """Result of evaluating one GPU count for one tier (immutable)."""

    # Tier this option was evaluated for; a forced GPU count is labelled "dev".
    tier: Tier
    # Number of GPUs (used as the tensor-parallel size) that was evaluated.
    gpu_count: int
    # Total weight bytes divided evenly across the GPUs, rounded up.
    weight_bytes_per_gpu: int
    kv_bytes_per_request: int  # at reference context (128K)
    # Requests that fit in the per-GPU headroom left after the weights, using
    # the TP-sharded per-GPU KV size at the reference context.
    max_concurrent_at_reference_ctx: int
    # concurrency ceiling at each context length the user asked about.
    # Each pair is (context token count, max concurrent requests that fit),
    # ordered by context length ascending.
    max_concurrent_by_context: tuple[tuple[int, int], ...]
    # Per-GPU memory budget in bytes after the overhead reservation.
    usable_bytes_per_gpu: int
    fits: bool  # False => the best we can do still overflows headroom at reference ctx
    # Human-readable explanation of the verdict, in English and Chinese.
    reason_en: str
    reason_zh: str
70
+
71
+
72
@dataclass(frozen=True)
class FleetRecommendation:
    """Planner output: the evaluated options plus the TP constraint context."""

    # One option per tier in ("min", "dev", "prod") order, or a single option
    # when the caller forced a GPU count.
    options: tuple[FleetOption, ...]
    # Tier the planner suggests by default.
    best_tier: Tier
    # Tensor-parallel sizes considered valid for this model within one node.
    valid_tp_sizes: tuple[int, ...]
    # Explanation of the TP-divisibility constraint, in English and Chinese.
    constraint_note_en: str
    constraint_note_zh: str
79
+
80
+
81
def plan(
    profile: ArchitectureProfile,
    weight_bytes: int,
    kv_bytes_per_request_at_ref: int,
    gpu: GPUSpec,
    forced_gpu_count: int | None = None,
    kv_bytes_by_context: dict[int, int] | None = None,
) -> FleetRecommendation:
    """Recommend GPU counts for the three tiers, or a single option when forced.

    Args:
        profile: Model architecture; its attention settings determine the
            valid tensor-parallel sizes and how far KV cache can be sharded.
        weight_bytes: Total model weight size in bytes.
        kv_bytes_per_request_at_ref: KV cache bytes for one request at the
            reference context length. All tier-fit decisions use this value.
        gpu: Target GPU; only ``memory_gb`` is read here.
        forced_gpu_count: When given, evaluate exactly this GPU count instead
            of searching. The single resulting option is labelled ``"dev"``.
        kv_bytes_by_context: Optional ``{context_tokens: kv_bytes}`` metadata
            used only for the per-option concurrency breakdown
            (e.g. "~23 concurrent @ 128K, ~2 @ 1M").

    Returns:
        A ``FleetRecommendation`` with one option per tier (min, dev, prod),
        or a single option when ``forced_gpu_count`` is set.

    Raises:
        ValueError: If ``forced_gpu_count`` is given but is less than 1.
    """
    # A zero count would divide by zero further down and a negative one would
    # yield negative per-GPU sizes, so reject both at the entry point.
    if forced_gpu_count is not None and forced_gpu_count < 1:
        raise ValueError(f"forced_gpu_count must be >= 1, got {forced_gpu_count}")

    kv_by_ctx = kv_bytes_by_context or {}
    bytes_per_gpu_total = gpu.memory_gb * 1_000_000_000
    usable_per_gpu = int(bytes_per_gpu_total * (1 - _OVERHEAD_FRACTION))
    valid_tp = _valid_tp_sizes(profile)

    constraint_en = _constraint_note_en(profile, valid_tp)
    constraint_zh = _constraint_note_zh(profile, valid_tp)

    if forced_gpu_count is not None:
        option = _evaluate_count(
            forced_gpu_count,
            profile=profile,
            weight_bytes=weight_bytes,
            kv_bytes=kv_bytes_per_request_at_ref,
            usable_per_gpu=usable_per_gpu,
            valid_tp=valid_tp,
            tier="dev",  # generic label when user forced
            kv_by_context=kv_by_ctx,
        )
        return FleetRecommendation(
            options=(option,),
            best_tier="dev",
            valid_tp_sizes=tuple(valid_tp),
            constraint_note_en=constraint_en,
            constraint_note_zh=constraint_zh,
        )

    tiers: tuple[Tier, ...] = ("min", "dev", "prod")
    options: list[FleetOption] = []
    for tier in tiers:
        gpu_count = _smallest_fitting_count(
            valid_tp,
            profile=profile,
            weight_bytes=weight_bytes,
            kv_bytes=kv_bytes_per_request_at_ref,
            usable_per_gpu=usable_per_gpu,
            concurrent=_KV_HEAD_ROOM_CONCURRENT[tier],
        )
        # Fall back to the largest TP if nothing fits — flagged as `fits=False`.
        chosen = gpu_count if gpu_count is not None else max(valid_tp)
        options.append(
            _evaluate_count(
                chosen,
                profile=profile,
                weight_bytes=weight_bytes,
                kv_bytes=kv_bytes_per_request_at_ref,
                usable_per_gpu=usable_per_gpu,
                valid_tp=valid_tp,
                tier=tier,
                kv_by_context=kv_by_ctx,
            )
        )

    # Best tier: dev if it fits, otherwise min, otherwise whatever exists
    # (prod is reported even though it does not fit either).
    min_option, dev_option = options[0], options[1]
    best: Tier
    if dev_option.fits:
        best = "dev"
    elif min_option.fits:
        best = "min"
    else:
        best = "prod"

    return FleetRecommendation(
        options=tuple(options),
        best_tier=best,
        valid_tp_sizes=tuple(valid_tp),
        constraint_note_en=constraint_en,
        constraint_note_zh=constraint_zh,
    )
155
+
156
+
157
def _valid_tp_sizes(profile: ArchitectureProfile) -> list[int]:
    """Return the divisors of num_heads that do not exceed the single-node cap.

    Falls back to ``[1]`` when the profile has no attention info or reports a
    non-positive head count.
    """
    attention = profile.attention
    if attention is None:
        return [1]

    head_count = attention.num_heads
    if head_count <= 0:
        return [1]

    largest_candidate = min(head_count, _MAX_TP_SINGLE_NODE)
    sizes = [
        size
        for size in range(1, largest_candidate + 1)
        if head_count % size == 0
    ]
    return sizes if sizes else [1]
164
+
165
+
166
def _kv_shards(profile: ArchitectureProfile, tp_size: int) -> int:
    """Return the divisor to apply to total KV cache for a given TP size.

    The split is bounded by the number of KV heads (treated as at least 1):
    ranks beyond that count only hold replicas, so they do not reduce the
    per-GPU KV footprint. A profile without attention info yields 1.
    """
    attention = profile.attention
    if attention is None:
        return 1

    kv_head_count = attention.num_kv_heads
    if kv_head_count < 1:
        kv_head_count = 1

    if tp_size <= kv_head_count:
        return tp_size
    return kv_head_count
176
+
177
+
178
def _smallest_fitting_count(
    valid_tp: list[int],
    *,
    profile: ArchitectureProfile,
    weight_bytes: int,
    kv_bytes: int,
    usable_per_gpu: int,
    concurrent: int,
) -> int | None:
    """Return the first entry of ``valid_tp`` that passes ``_fits``, else None.

    Candidates are tried in the order given, so the result is the smallest
    fitting size only when ``valid_tp`` is sorted ascending.
    """
    fitting = (
        candidate
        for candidate in valid_tp
        if _fits(candidate, profile, weight_bytes, kv_bytes, usable_per_gpu, concurrent)
    )
    return next(fitting, None)
191
+
192
+
193
def _fits(
    gpu_count: int,
    profile: ArchitectureProfile,
    weight_bytes: int,
    kv_bytes: int,
    usable_per_gpu: int,
    concurrent: int,
) -> bool:
    """Report whether weights plus ``concurrent`` requests of KV fit on one GPU.

    Weights are divided evenly across ``gpu_count`` GPUs; KV cache is divided
    by the shard count from ``_kv_shards``. Both shares are rounded up.
    """
    per_gpu_weight = math.ceil(weight_bytes / gpu_count)
    kv_divisor = _kv_shards(profile, gpu_count)
    per_gpu_kv_one_request = math.ceil(kv_bytes / kv_divisor)
    per_gpu_total = per_gpu_weight + concurrent * per_gpu_kv_one_request
    return per_gpu_total <= usable_per_gpu
206
+
207
+
208
def _evaluate_count(
    gpu_count: int,
    *,
    profile: ArchitectureProfile,
    weight_bytes: int,
    kv_bytes: int,
    usable_per_gpu: int,
    valid_tp: list[int],
    tier: Tier,
    kv_by_context: dict[int, int],
) -> FleetOption:
    """Build the ``FleetOption`` describing how ``gpu_count`` GPUs perform.

    Args:
        gpu_count: Number of GPUs (tensor-parallel size) to evaluate.
        profile: Model architecture, used for KV sharding and head count.
        weight_bytes: Total model weight bytes.
        kv_bytes: KV cache bytes for one request at the reference context.
        usable_per_gpu: Per-GPU memory budget in bytes after overhead.
        valid_tp: Tensor-parallel sizes considered valid for this model.
        tier: Tier whose concurrency target decides ``fits``.
        kv_by_context: ``{context_tokens: kv_bytes}`` for the per-context
            concurrency breakdown; may be empty.

    Returns:
        A ``FleetOption`` whose ``fits`` flag reflects memory only. A count
        outside ``valid_tp`` is called out in the reason strings instead.
    """
    required_concurrent = _KV_HEAD_ROOM_CONCURRENT[tier]

    weight_per_gpu = math.ceil(weight_bytes / gpu_count)
    shards = _kv_shards(profile, gpu_count)
    kv_per_gpu = math.ceil(kv_bytes / shards)
    headroom = usable_per_gpu - weight_per_gpu
    max_concurrent = max(0, headroom // kv_per_gpu) if kv_per_gpu > 0 else 0
    # Per-context concurrency, sorted by context length ascending, each using
    # the TP-sharded per-GPU KV.
    max_concurrent_by_ctx = tuple(
        (
            ctx,
            (max(0, headroom // math.ceil(kv / shards)) if kv > 0 else 0),
        )
        for ctx, kv in sorted(kv_by_context.items())
    )
    fits = _fits(
        gpu_count,
        profile,
        weight_bytes,
        kv_bytes,
        usable_per_gpu,
        required_concurrent,
    )

    # Reason strings
    if gpu_count not in valid_tp:
        heads = profile.attention.num_heads if profile.attention else 0
        # valid_tp is capped at the single-node maximum, so a count can be
        # rejected even though it divides num_heads. Report the real cause
        # rather than claiming a divisibility failure.
        over_node_limit = (
            gpu_count > _MAX_TP_SINGLE_NODE and heads > 0 and heads % gpu_count == 0
        )
        if over_node_limit:
            reason_en = (
                f"GPU count {gpu_count} exceeds the single-node TP limit of "
                f"{_MAX_TP_SINGLE_NODE} — valid TP sizes: {sorted(valid_tp)}"
            )
            reason_zh = (
                f"GPU 张数 {gpu_count} 超过单节点 TP 上限 {_MAX_TP_SINGLE_NODE}"
                f"——有效 TP 张数:{sorted(valid_tp)}"
            )
        else:
            reason_en = (
                f"GPU count {gpu_count} does not divide num_heads — valid TP sizes: {sorted(valid_tp)}"
            )
            reason_zh = f"GPU 张数 {gpu_count} 无法整除注意力头数——有效 TP 张数:{sorted(valid_tp)}"
    elif not fits:
        reason_en = (
            f"Weights + {required_concurrent}x KV would exceed "
            f"{usable_per_gpu / 1e9:.1f} GB usable per GPU"
        )
        reason_zh = (
            f"权重 + {required_concurrent} 份 KV 超过单卡可用的 "
            f"{usable_per_gpu / 1e9:.1f} GB"
        )
    else:
        reason_en = f"fits ~{max_concurrent} concurrent @ {_REFERENCE_CTX_TOKENS // 1024}K ctx"
        reason_zh = f"可容纳约 {max_concurrent} 并发请求 @ {_REFERENCE_CTX_TOKENS // 1024}K 上下文"

    return FleetOption(
        tier=tier,
        gpu_count=gpu_count,
        weight_bytes_per_gpu=weight_per_gpu,
        kv_bytes_per_request=kv_bytes,
        max_concurrent_at_reference_ctx=max_concurrent,
        max_concurrent_by_context=max_concurrent_by_ctx,
        usable_bytes_per_gpu=usable_per_gpu,
        fits=fits,
        reason_en=reason_en,
        reason_zh=reason_zh,
    )
273
+
274
+
275
def _constraint_note_en(profile: ArchitectureProfile, valid_tp: list[int]) -> str:
    """Return the English sentence describing the TP-divisibility constraint.

    The head count is shown as 0 when the profile has no attention info.
    """
    attention = profile.attention
    if attention:
        head_count = attention.num_heads
    else:
        head_count = 0
    return f"TP must divide num_heads={head_count}. Candidates within one node (<=8 GPUs): {valid_tp}."
278
+
279
+
280
def _constraint_note_zh(profile: ArchitectureProfile, valid_tp: list[int]) -> str:
    """Return the Chinese sentence describing the TP-divisibility constraint.

    The head count is shown as 0 when the profile has no attention info.
    """
    attention = profile.attention
    if attention:
        head_count = attention.num_heads
    else:
        head_count = 0
    return f"TP 张数必须整除 num_heads={head_count}。单节点(≤8 卡)候选:{valid_tp}。"
src/llm_cal/hardware/__init__.py ADDED
File without changes
src/llm_cal/hardware/gpu_database.yaml ADDED
@@ -0,0 +1,613 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GPU database — v0.1.
2
+ #
3
+ # DATA PROVENANCE:
4
+ # Numeric specs (memory_gb, nvlink_bandwidth_gbps, fp16_tflops, fp8/fp4_support)
5
+ # come from public vendor datasheets and commonly-cited benchmarks. Each entry
6
+ # records its source in `spec_source` so users can audit.
7
+ #
8
+ # Conventions:
9
+ # - memory_gb: per-card HBM / GDDR in GB (vendor nominal)
10
+ # - nvlink_bandwidth_gbps: aggregate NVLink (or equivalent like xGMI/HCCS)
11
+ # bandwidth. 0 if the GPU has no high-bandwidth interconnect (e.g. consumer
12
+ # Ada removed NVLink).
13
+ # - fp16_tflops: peak dense FP16/BF16 with Tensor Cores; vendor cited figure.
14
+ # - fp8_support / fp4_support: whether the GPU has NATIVE Tensor Core
15
+ # acceleration for that precision. Software emulation does NOT count.
16
+ #
17
+ # To add a new GPU: append an entry with all required fields + spec_source.
18
+ # See docs/architecture-guide.md "How to add a new GPU".
19
+ schema_version: 1
20
+ gpus:
21
+ # ========================================================================
22
+ # NVIDIA Blackwell (2024+) — native FP4
23
+ # ========================================================================
24
+ - id: B200
25
+ aliases: [B200-SXM, B200-192G]
26
+ memory_gb: 192
27
+ nvlink_bandwidth_gbps: 1800
28
+ memory_bandwidth_gbps: 8000
29
+ fp16_tflops: 2250
30
+ fp8_support: true
31
+ fp4_support: true
32
+ spec_source: "NVIDIA Blackwell architecture overview (nvidia.com/blackwell)"
33
+ notes_en: "Blackwell flagship. Native FP4 Tensor Cores. First GPU that accelerates DeepSeek-V4-Flash-style FP4 at hardware level."
34
+ notes_zh: "Blackwell 旗舰。原生 FP4 Tensor Core,首款在硬件层加速 DeepSeek-V4-Flash 类 FP4 模型的 GPU。"
35
+
36
+ # ========================================================================
37
+ # NVIDIA Hopper (2022+)
38
+ # ========================================================================
39
+ - id: H100
40
+ aliases: [H100-SXM5, H100-80G, H100-SXM]
41
+ memory_gb: 80
42
+ nvlink_bandwidth_gbps: 900
43
+ memory_bandwidth_gbps: 3350
44
+ fp16_tflops: 989
45
+ fp8_support: true
46
+ fp4_support: false
47
+ spec_source: "NVIDIA H100 datasheet (nvidia.com/h100)"
48
+ notes_en: "Hopper flagship. Full NVLink."
49
+ notes_zh: "Hopper 架构旗舰,完整 NVLink 带宽。"
50
+
51
+ - id: H800
52
+ aliases: [H800-SXM5, H800-80G]
53
+ memory_gb: 80
54
+ nvlink_bandwidth_gbps: 400
55
+ memory_bandwidth_gbps: 3350
56
+ fp16_tflops: 989
57
+ fp8_support: true
58
+ fp4_support: false
59
+ spec_source: "NVIDIA H800 compliance variant — NVLink halved from H100 per US export controls"
60
+ notes_en: "China-regulated H100 variant. NVLink bandwidth halved (400 vs 900). Same HBM and compute as H100."
61
+ notes_zh: "H100 的中国合规版本。NVLink 带宽减半(400 vs 900 GB/s),HBM 容量和算力与 H100 相同。"
62
+
63
+ - id: H200
64
+ aliases: [H200-SXM, H200-141G]
65
+ memory_gb: 141
66
+ nvlink_bandwidth_gbps: 900
67
+ memory_bandwidth_gbps: 4800
68
+ fp16_tflops: 989
69
+ fp8_support: true
70
+ fp4_support: false
71
+ spec_source: "NVIDIA H200 datasheet (nvidia.com/h200)"
72
+ notes_en: "Hopper with HBM3e. 141 GB per GPU."
73
+ notes_zh: "搭载 HBM3e 的 Hopper,单卡 141 GB。"
74
+
75
+ - id: GH200
76
+ aliases: [Grace-Hopper, GH200-144G, GH200-96G]
77
+ memory_gb: 144
78
+ nvlink_bandwidth_gbps: 900
79
+ memory_bandwidth_gbps: 4800
80
+ fp16_tflops: 989
81
+ fp8_support: true
82
+ fp4_support: false
83
+ spec_source: "NVIDIA GH200 Grace Hopper datasheet 2023 (144GB HBM3e variant, dense FP16=989 TFLOPS; sparsity doubles it)"
84
+ notes_en: "Grace Hopper superchip — Hopper GPU + Grace CPU on one module. 144 GB HBM3e (96 GB HBM3 variant also exists). NVLink-C2C 900 GB/s CPU<->GPU unified. TDP programmable 450-1000W. Ideal for models that spill beyond single GPU memory because GPU can access CPU LPDDR coherently."
85
+ notes_zh: "Grace Hopper 超级芯片 — Hopper GPU + Grace CPU 融合模组。144 GB HBM3e(另有 96 GB HBM3 版本)。NVLink-C2C 让 CPU/GPU 共享统一内存空间,900 GB/s 双向。TDP 可编程 450-1000W。模型单卡显存装不下时,可一致地访问 CPU 的 LPDDR。"
86
+
87
+ - id: GB200
88
+ aliases: [Grace-Blackwell, GB200-per-GPU]
89
+ memory_gb: 192
90
+ nvlink_bandwidth_gbps: 1800
91
+ memory_bandwidth_gbps: 8000
92
+ fp16_tflops: 2250
93
+ fp8_support: true
94
+ fp4_support: true
95
+ spec_source: "NVIDIA GB200 Superchip datasheet 2024 — per-GPU view. Each GB200 = 2 B200 + Grace CPU. Per B200: 192 GB HBM3e, 8 TB/s, 2250 TFLOPS dense FP16 (4500 sparsity). Grace CPU adds up to 480 GB LPDDR5x accessible via NVLink-C2C."
96
+ notes_en: "Grace Blackwell superchip — 2 B200 GPUs + Grace CPU on one module. Per-GPU specs here match B200, but each GB200 module unlocks 384 GB HBM3e total (192+192) plus coherent access to 480 GB Grace CPU LPDDR5x. FP4 native. Only deployable in NVL4/NVL72 rack-scale systems with liquid cooling. Per-GPU TDP 1200W."
97
+ notes_zh: "Grace Blackwell 超级芯片 — 双 B200 GPU + Grace CPU 融合。此处展示单 GPU 视角规格,与 B200 基本一致。每块 GB200 模组合计 384 GB HBM3e(双卡),并通过 NVLink-C2C 一致访问 480 GB Grace CPU 的 LPDDR5x。原生 FP4。仅在 NVL4 / NVL72 液冷机架系统中部署。单 GPU TDP 1200W。"
98
+
99
+ - id: H20
100
+ aliases: [H20-96G, H20-SXM]
101
+ memory_gb: 96
102
+ nvlink_bandwidth_gbps: 900
103
+ memory_bandwidth_gbps: 4000
104
+ fp16_tflops: 148
105
+ fp8_support: true
106
+ fp4_support: false
107
+ spec_source: "NVIDIA H20 — released 2024 as China-compliant successor to H800. Compute heavily reduced (~15% of H100); memory bandwidth and HBM3e preserved."
108
+ notes_en: "China-compliance Hopper post-Oct-2023 export rules. Compute ~15% of H100 (148 vs 989 TFLOPS), but HBM3e memory bandwidth preserved. Good for memory-bound LLM inference, poor for training."
109
+ notes_zh: "2023 年 10 月出口管制后的中国合规 Hopper。算力仅为 H100 的约 15%(148 vs 989 TFLOPS),但 HBM3e 显存带宽保留。推理(显存带宽受限)尚可,训练基本不实用。"
110
+
111
+ # ========================================================================
112
+ # NVIDIA Ada Lovelace (datacenter) — FP8 yes, NVLink no
113
+ # ========================================================================
114
+ - id: L40S
115
+ aliases: [L40-S, L40S-48G]
116
+ memory_gb: 48
117
+ nvlink_bandwidth_gbps: 0
118
+ memory_bandwidth_gbps: 864
119
+ fp16_tflops: 362
120
+ fp8_support: true
121
+ fp4_support: false
122
+ spec_source: "NVIDIA L40S datasheet 2023"
123
+ notes_en: "Ada datacenter. 48 GB GDDR6. No NVLink — multi-GPU setups rely on PCIe. Cost-effective for small/medium model inference."
124
+ notes_zh: "Ada 架构数据中心卡,48 GB GDDR6。无 NVLink,多卡需走 PCIe。中小模型推理性价比高。"
125
+
126
+ - id: L40
127
+ aliases: [L40-48G]
128
+ memory_gb: 48
129
+ nvlink_bandwidth_gbps: 0
130
+ memory_bandwidth_gbps: 864
131
+ fp16_tflops: 181
132
+ fp8_support: true
133
+ fp4_support: false
134
+ spec_source: "NVIDIA L40 datasheet 2022"
135
+ notes_en: "Ada datacenter predecessor to L40S. Same 48 GB, half the compute. Widely deployed in enterprise clouds."
136
+ notes_zh: "L40S 的前代,Ada 架构数据中心卡。同为 48 GB,算力减半。企业私有云部署量较大。"
137
+
138
+ - id: L4
139
+ aliases: [L4-24G]
140
+ memory_gb: 24
141
+ nvlink_bandwidth_gbps: 0
142
+ memory_bandwidth_gbps: 300
143
+ fp16_tflops: 121
144
+ fp8_support: true
145
+ fp4_support: false
146
+ spec_source: "NVIDIA L4 datasheet 2023"
147
+ notes_en: "Low-profile Ada, 24 GB GDDR6. Common in low-concurrency inference / transcoding. No NVLink."
148
+ notes_zh: "低功耗 Ada,24 GB GDDR6。常用于低并发推理和转码场景。无 NVLink。"
149
+
150
+ - id: RTX6000-Ada
151
+ aliases: [RTX-6000-Ada, RTX6000Ada, L6000]
152
+ memory_gb: 48
153
+ nvlink_bandwidth_gbps: 0
154
+ memory_bandwidth_gbps: 960
155
+ fp16_tflops: 365
156
+ fp8_support: true
157
+ fp4_support: false
158
+ spec_source: "NVIDIA RTX 6000 Ada datasheet 2022"
159
+ notes_en: "Ada Pro workstation. 48 GB, similar to L40S but for workstations. FP8 yes, no NVLink."
160
+ notes_zh: "Ada Pro 工作站卡。48 GB,规格接近 L40S 但面向工作站。支持 FP8,无 NVLink。"
161
+
162
+ - id: RTX4090
163
+ aliases: ["4090", RTX-4090]
164
+ memory_gb: 24
165
+ nvlink_bandwidth_gbps: 0
166
+ memory_bandwidth_gbps: 1008
167
+ fp16_tflops: 165
168
+ fp8_support: true
169
+ fp4_support: false
170
+ spec_source: "NVIDIA RTX 4090 datasheet 2022"
171
+ notes_en: "Consumer Ada. No NVLink. Large models need multi-GPU via PCIe (slower)."
172
+ notes_zh: "消费级 Ada 架构,无 NVLink。大模型多卡只能走 PCIe(明显更慢)。"
173
+
174
+ # ========================================================================
175
+ # NVIDIA Ampere (2020+)
176
+ # ========================================================================
177
+ - id: A100-80G
178
+ aliases: [A100-80, A100-SXM-80G]
179
+ memory_gb: 80
180
+ nvlink_bandwidth_gbps: 600
181
+ memory_bandwidth_gbps: 2039
182
+ fp16_tflops: 312
183
+ fp8_support: false
184
+ fp4_support: false
185
+ spec_source: "NVIDIA A100 datasheet 2020"
186
+ notes_en: "Ampere. No native FP8. Still widely deployed."
187
+ notes_zh: "Ampere 架构。不原生支持 FP8,但部署量仍然非常大。"
188
+
189
+ - id: A100-40G
190
+ aliases: [A100-40, A100-SXM-40G]
191
+ memory_gb: 40
192
+ nvlink_bandwidth_gbps: 600
193
+ memory_bandwidth_gbps: 1555
194
+ fp16_tflops: 312
195
+ fp8_support: false
196
+ fp4_support: false
197
+ spec_source: "NVIDIA A100 40GB datasheet 2020"
198
+ notes_en: "Ampere 40 GB variant. Smaller HBM limits large-model single-node deployments."
199
+ notes_zh: "Ampere 的 40 GB 版本,显存较小,大模型单机部署受限。"
200
+
201
+ - id: A40
202
+ aliases: [A40-48G]
203
+ memory_gb: 48
204
+ nvlink_bandwidth_gbps: 112
205
+ memory_bandwidth_gbps: 696
206
+ fp16_tflops: 150
207
+ fp8_support: false
208
+ fp4_support: false
209
+ spec_source: "NVIDIA A40 datasheet 2020"
210
+ notes_en: "Ampere workstation. 48 GB with NVLink bridge (limited bandwidth). No FP8."
211
+ notes_zh: "Ampere 工作站卡,48 GB + NVLink 桥接(带宽较低)。不支持 FP8。"
212
+
213
+ - id: A10
214
+ aliases: [A10-24G]
215
+ memory_gb: 24
216
+ nvlink_bandwidth_gbps: 0
217
+ memory_bandwidth_gbps: 600
218
+ fp16_tflops: 125
219
+ fp8_support: false
220
+ fp4_support: false
221
+ spec_source: "NVIDIA A10 datasheet 2021"
222
+ notes_en: "Ampere inference card. 24 GB GDDR6. Widely used for low-cost inference in enterprise clouds."
223
+ notes_zh: "Ampere 推理卡,24 GB GDDR6。企业云低成本推理常用配置。"
224
+
225
+ - id: A10G
226
+ aliases: [A10G-24G]
227
+ memory_gb: 24
228
+ nvlink_bandwidth_gbps: 0
229
+ memory_bandwidth_gbps: 600
230
+ fp16_tflops: 125
231
+ fp8_support: false
232
+ fp4_support: false
233
+ spec_source: "NVIDIA A10G — AWS-specific variant of A10, g5 instances"
234
+ notes_en: "AWS-specific A10 variant. Same silicon as A10, deployed in g5 EC2 instances. No NVLink."
235
+ notes_zh: "AWS 定制版 A10,用于 g5 EC2 实例。核心规格与 A10 相同,无 NVLink。"
236
+
237
+ # ========================================================================
238
+ # NVIDIA Volta / Turing (older, still deployed)
239
+ # ========================================================================
240
+ - id: V100-SXM2-32G
241
+ aliases: [V100, V100-32G, V100-SXM2]
242
+ memory_gb: 32
243
+ nvlink_bandwidth_gbps: 300
244
+ memory_bandwidth_gbps: 900
245
+ fp16_tflops: 125
246
+ fp8_support: false
247
+ fp4_support: false
248
+ spec_source: "NVIDIA V100 SXM2 datasheet 2017"
249
+ notes_en: "Volta. No FP8. Still deployed in many existing clusters — works for smaller models, tight for 70B+."
250
+ notes_zh: "Volta 架构。不支持 FP8,但仍在大量老集群中服役。小模型够用,70B+ 紧张。"
251
+
252
+ - id: V100-PCIe-32G
253
+ aliases: [V100-PCIe, V100-PCI]
254
+ memory_gb: 32
255
+ nvlink_bandwidth_gbps: 0
256
+ memory_bandwidth_gbps: 900
257
+ fp16_tflops: 112
258
+ fp8_support: false
259
+ fp4_support: false
260
+ spec_source: "NVIDIA V100 PCIe datasheet 2017 — PCIe variant of V100, no NVLink."
261
+ notes_en: "PCIe version of V100. No NVLink, lower clocks than SXM2. Common in older servers."
262
+ notes_zh: "V100 的 PCIe 版本,无 NVLink,主频稍低。老服务器常见配置。"
263
+
264
+ - id: T4
265
+ aliases: [T4-16G]
266
+ memory_gb: 16
267
+ nvlink_bandwidth_gbps: 0
268
+ memory_bandwidth_gbps: 320
269
+ fp16_tflops: 65
270
+ fp8_support: false
271
+ fp4_support: false
272
+ spec_source: "NVIDIA T4 datasheet 2018"
273
+ notes_en: "Turing inference card. 16 GB, no NVLink, no FP8. Common as the cheapest cloud GPU option."
274
+ notes_zh: "Turing 推理卡。16 GB,无 NVLink,无 FP8。各云厂商最便宜的 GPU 选项之一。"
275
+
276
+ # ========================================================================
277
+ # AMD (ROCm, xGMI instead of NVLink)
278
+ # ========================================================================
279
+ - id: MI325X
280
+ aliases: [MI325X-256G, AMD-MI325X]
281
+ memory_gb: 256
282
+ nvlink_bandwidth_gbps: 896
283
+ memory_bandwidth_gbps: 6000
284
+ fp16_tflops: 1307
285
+ fp8_support: true
286
+ fp4_support: false
287
+ spec_source: "AMD Instinct MI325X datasheet 2024 — 256 GB HBM3E, 6 TB/s bandwidth, 1000W TDP, CDNA 3."
288
+ notes_en: "AMD flagship 2024. 256 GB HBM3E (largest single-card memory in v0.1 database). Upgraded MI300X with faster HBM3E and more capacity. Dense FP16 1307 TFLOPS, FP8 2615 TFLOPS. 1000W TDP, OAM format. ROCm software stack."
289
+ notes_zh: "AMD 2024 年旗舰。256 GB HBM3E(v0.1 数据库中单卡最大)。MI300X 升级版,HBM3E 更快、容量更大。Dense FP16 1307 TFLOPS,FP8 2615 TFLOPS。1000W TDP,OAM 形态。需要 ROCm 软件栈。"
290
+
291
+ - id: MI300X
292
+ aliases: [MI300X-192G, AMD-MI300X]
293
+ memory_gb: 192
294
+ nvlink_bandwidth_gbps: 896
295
+ memory_bandwidth_gbps: 5300
296
+ fp16_tflops: 1307
297
+ fp8_support: true
298
+ fp4_support: false
299
+ spec_source: "AMD Instinct MI300X datasheet 2023-12"
300
+ notes_en: "AMD flagship 2023. 192 GB HBM3. xGMI 896 GB/s (like NVLink). Software stack: ROCm + vLLM. Support for DeepSeek V4 etc. lags Nvidia by weeks."
301
+ notes_zh: "AMD 2023 年旗舰。192 GB HBM3。xGMI 互联 896 GB/s(类 NVLink)。需要 ROCm + vLLM 栈。新模型支持通常比 NVIDIA 晚几周。"
302
+
303
+ - id: MI250X
304
+ aliases: [MI250X-128G, AMD-MI250X]
305
+ memory_gb: 128
306
+ nvlink_bandwidth_gbps: 800
307
+ memory_bandwidth_gbps: 3280
308
+ fp16_tflops: 383
309
+ fp8_support: false
310
+ fp4_support: false
311
+ spec_source: "AMD Instinct MI250X datasheet 2022"
312
+ notes_en: "AMD previous-gen. 128 GB HBM2e. No FP8. Deployed in some HPC clusters (Frontier)."
313
+ notes_zh: "AMD 上代数据中心卡。128 GB HBM2e,不支持 FP8。少数 HPC 集群(如 Frontier 超算)有部署。"
314
+
315
+ - id: MI210
316
+ aliases: [MI210-64G, AMD-MI210]
317
+ memory_gb: 64
318
+ nvlink_bandwidth_gbps: 300
319
+ memory_bandwidth_gbps: 1600
320
+ fp16_tflops: 181
321
+ fp8_support: false
322
+ fp4_support: false
323
+ spec_source: "AMD Instinct MI210 datasheet 2022 — CDNA 2, single-die version of MI250. 64 GB HBM2e."
324
+ notes_en: "AMD CDNA 2 single-die. 64 GB HBM2e, 1.6 TB/s. No FP8 (CDNA 2 limitation). Common as entry-level AMD datacenter card."
325
+ notes_zh: "AMD CDNA 2 单 die 版本,64 GB HBM2e,1.6 TB/s 带宽。不支持 FP8(CDNA 2 限制)。AMD 入门数据中心卡常见配置。"
326
+
327
+ # ========================================================================
328
+ # Intel Habana Gaudi
329
+ # ========================================================================
330
+ - id: Gaudi3
331
+ aliases: [Gaudi-3, Habana-Gaudi3]
332
+ memory_gb: 128
333
+ nvlink_bandwidth_gbps: 1200
334
+ memory_bandwidth_gbps: 3700
335
+ fp16_tflops: 1835
336
+ fp8_support: true
337
+ fp4_support: false
338
+ spec_source: "Intel Gaudi 3 datasheet 2024"
339
+ notes_en: "Intel Habana Gaudi 3. 128 GB HBM2e. FP8 support. Software stack: SynapseAI (not CUDA). vLLM support via Intel fork."
340
+ notes_zh: "Intel Habana Gaudi 3。128 GB HBM2e,支持 FP8。软件栈为 SynapseAI(非 CUDA)。vLLM 需走 Intel 分支。"
341
+
342
+ - id: Gaudi2
343
+ aliases: [Gaudi-2, Habana-Gaudi2]
344
+ memory_gb: 96
345
+ nvlink_bandwidth_gbps: 2400
346
+ memory_bandwidth_gbps: 2450
347
+ fp16_tflops: 432
348
+ fp8_support: true
349
+ fp4_support: false
350
+ spec_source: "Intel Gaudi 2 datasheet 2022"
351
+ notes_en: "Intel Habana Gaudi 2. 96 GB HBM2e with 24x100GbE on-board (used for scale-out). FP8 support."
352
+ notes_zh: "Intel Habana Gaudi 2。96 GB HBM2e,板载 24 个 100GbE(用于横向扩展)。支持 FP8。"
353
+
354
+ # ========================================================================
355
+ # Huawei Ascend
356
+ # ========================================================================
357
+ # The 910B "series" is actually a set of sub-variants (B1/B2/B3/B4) with
358
+ # different compute tiers and memory sizes. `910B` as a plain id resolves
359
+ # to 910B3 (the most common training configuration).
360
+ - id: "910A"
361
+ aliases: [Ascend-910A]
362
+ memory_gb: 32
363
+ nvlink_bandwidth_gbps: 400
364
+ memory_bandwidth_gbps: 1200
365
+ fp16_tflops: 256
366
+ fp8_support: false
367
+ fp4_support: false
368
+ spec_source: "Ascend 910 (1st gen) — 7nm, 32 GB HBM. Community-compiled spec."
369
+ notes_en: "Huawei Ascend 910 (1st gen, 2019). Predecessor to 910B. Still deployed in many older clusters. HCCS interconnect."
370
+ notes_zh: "华为昇腾 910 第一代(2019 年),910B 的前身。很多老集群仍在使用。HCCS 互联。"
371
+
372
+ - id: "910B1"
373
+ aliases: [Ascend-910B1]
374
+ memory_gb: 64
375
+ nvlink_bandwidth_gbps: 400
376
+ memory_bandwidth_gbps: 1600
377
+ fp16_tflops: 414
378
+ fp8_support: false
379
+ fp4_support: false
380
+ spec_source: "Ascend 910B1 — training variant, Atlas 800T A2. Commonly cited as top-tier 910B sub-variant; TSMC 7nm process."
381
+ notes_en: "Top-tier 910B training variant. 64 GB HBM2, 414 TFLOPS FP16. Used in Atlas 800T A2 training servers. No native FP8."
382
+ notes_zh: "910B 系列顶配训练版本。64 GB HBM2,FP16 算力 414 TFLOPS。搭载于 Atlas 800T A2 训练服务器。不原生支持 FP8。"
383
+
384
+ - id: "910B2"
385
+ aliases: [Ascend-910B2]
386
+ memory_gb: 64
387
+ nvlink_bandwidth_gbps: 400
388
+ memory_bandwidth_gbps: 1600
389
+ fp16_tflops: 376
390
+ fp8_support: false
391
+ fp4_support: false
392
+ spec_source: "Ascend 910B2 — training variant, commonly cited as standard 910B training configuration."
393
+ notes_en: "Standard 910B training variant. 64 GB HBM2, 376 TFLOPS FP16. General-purpose training server baseline."
394
+ notes_zh: "910B 常规训练版本。64 GB HBM2,FP16 算力 376 TFLOPS。通用训练服务器标准配置。"
395
+
396
+ - id: "910B3"
397
+ aliases: [Ascend-910B3, "910B", Ascend-910B]
398
+ memory_gb: 64
399
+ nvlink_bandwidth_gbps: 400
400
+ memory_bandwidth_gbps: 1600
401
+ fp16_tflops: 313
402
+ fp8_support: false
403
+ fp4_support: false
404
+ spec_source: "Ascend 910B3 — training variant, SMIC-produced per industry reports. (aliased as bare `910B` for convenience)"
405
+ notes_en: "910B3 training variant, 313 TFLOPS FP16. Believed to be SMIC-produced (vs TSMC for B1/B2). The `910B` bare name resolves here since B3 is the most commonly referenced."
406
+ notes_zh: "910B3 训练版本,FP16 算力 313 TFLOPS。业界普遍认为由中芯国际生产(B1/B2 据传为台积电)。裸写 `910B` 时默认解析到此条目(最常被引用)。"
407
+
408
+ - id: "910B4"
409
+ aliases: [Ascend-910B4]
410
+ memory_gb: 32
411
+ nvlink_bandwidth_gbps: 400
412
+ memory_bandwidth_gbps: 1600
413
+ fp16_tflops: 280
414
+ fp8_support: false
415
+ fp4_support: false
416
+ spec_source: "Ascend 910B4 — inference variant, 32 GB HBM (half of B1/B2/B3). Atlas 800I A2 inference server."
417
+ notes_en: "910B4 is the inference-oriented 910B variant. 32 GB HBM (half of training variants), 280 TFLOPS FP16. Deployed in Atlas 800I A2 inference servers."
418
+ notes_zh: "910B4 是 910B 系列的推理版本。32 GB HBM(训练版本的一半),FP16 算力 280 TFLOPS。搭载于 Atlas 800I A2 推理服务器。"
419
+
420
+ - id: "910C"
421
+ aliases: [Ascend-910C]
422
+ memory_gb: 64
423
+ nvlink_bandwidth_gbps: 400
424
+ memory_bandwidth_gbps: 3200
425
+ fp16_tflops: 780
426
+ fp8_support: false
427
+ fp4_support: false
428
+ spec_source: "Huawei Ascend 910C — launched 2024, commonly cited specs pending official datasheet"
429
+ notes_en: "Huawei Ascend 910C (2024). Roughly 2x compute vs 910B at similar memory. FP8 support status unclear — check CANN version notes. Software ecosystem matures but still behind NVIDIA."
430
+ notes_zh: "华为昇腾 910C(2024 年)。算力大约是 910B 的两倍,显存相当。FP8 支持情况需看 CANN 版本。软件生态持续完善但仍落后于 NVIDIA。"
431
+
432
+ - id: Atlas-300I-Duo
433
+ aliases: [Atlas300IDuo, 300I-Duo]
434
+ memory_gb: 48
435
+ nvlink_bandwidth_gbps: 0
436
+ memory_bandwidth_gbps: 204
437
+ fp16_tflops: 140
438
+ fp8_support: false
439
+ fp4_support: false
440
+ spec_source: "Huawei Atlas 300I Duo inference card — 2x Ascend 310P per card. 140 TFLOPS FP16 per card, 48 GB LPDDR4X."
441
+ notes_en: "Huawei Atlas 300I Duo inference card: 2x Ascend 310P with combined 48 GB LPDDR4X (96 GB variant available). 280 TOPS INT8. LPDDR4X gives 204 GB/s total bandwidth — much lower than HBM-based cards. PCIe-only, no NVLink. Best for cost-sensitive inference."
442
+ notes_zh: "华为 Atlas 300I Duo 推理卡:双 Ascend 310P,合计 48 GB LPDDR4X(另有 96 GB 版本)。INT8 280 TOPS。显存是 LPDDR4X,带宽 204 GB/s,远低于 HBM 卡。仅 PCIe,无 NVLink。主要面向成本敏感的推理场景。"
443
+
444
+ # ========================================================================
445
+ # Chinese domestic AI accelerators (non-NVIDIA / non-AMD)
446
+ # ========================================================================
447
+ - id: MXC500
448
+ aliases: [MetaX-MXC500, XiYun-C500, 曦云C500]
449
+ memory_gb: 64
450
+ nvlink_bandwidth_gbps: 800
451
+ memory_bandwidth_gbps: 1800
452
+ fp16_tflops: 240
453
+ fp8_support: false
454
+ fp4_support: false
455
+ spec_source: "MetaX 沐曦 MXC500 / 曦云 C500 (PCIe variant, 350W). OAM variant has 280 TFLOPS FP16 @ 450W. 64 GB HBM2e, 1.8 TB/s memory bandwidth, MetaXLink interconnect."
456
+ notes_en: "MetaX (沐曦) MXC500. 7nm, CUDA-compatible via MXMACA stack. PCIe variant: 240 TFLOPS FP16, 350W. OAM variant: 280 TFLOPS FP16, 450W. Targets A100-class workloads. No native FP8."
457
+ notes_zh: "沐曦曦云 C500。7nm 工艺,通过 MXMACA 软件栈兼容 CUDA。PCIe 版本 FP16 240 TFLOPS / 350W,OAM 版本 280 TFLOPS / 450W。对标 A100 场景。不原生支持 FP8。"
458
+
459
+ - id: MXC550
460
+ aliases: [MetaX-MXC550, XiYun-C550, 曦云C550]
461
+ memory_gb: 64
462
+ nvlink_bandwidth_gbps: 896
463
+ memory_bandwidth_gbps: 1600
464
+ fp16_tflops: 240
465
+ fp8_support: false
466
+ fp4_support: false
467
+ spec_source: "MetaX 沐曦 MXC550 / 曦云 C550 (OAM, 2024). Partial specs from third-party comparison docs; full datasheet TBD. 8-card fabric bandwidth 896 GB/s."
468
+ notes_en: "MetaX (沐曦) MXC550 — 2024 OAM-format flagship. Supports OAM 1.5 + 2.0. 8-card fabric bandwidth 896 GB/s. Full specs pending official datasheet — figures here are from third-party comparison articles."
469
+ notes_zh: "沐曦曦云 C550 — 2024 年 OAM 形态旗舰。支持 OAM 1.5 + 2.0 规范。八卡全互联带宽 896 GB/s。完整规格待官方数据表披露,此处数字来自第三方对比资料。"
470
+
471
+ - id: Kunlun-P800
472
+ aliases: [KunlunXin-P800, 昆仑芯P800, Kunlun-Gen3]
473
+ memory_gb: 96
474
+ nvlink_bandwidth_gbps: 400
475
+ memory_bandwidth_gbps: 2000
476
+ fp16_tflops: 345
477
+ fp8_support: true
478
+ fp4_support: false
479
+ spec_source: "KunlunXin P800 (3rd gen, 2024). 96 GB HBM3 (largest among Chinese domestic AI chips). Baidu Cloud uses P800 for first-party inference. Specs partially inferred from public Baidu announcements; official datasheet limited distribution."
480
+ notes_en: "Baidu KunlunXin P800 — 3rd gen, 2024. 96 GB HBM3. Reported to support 8-bit inference and MoE optimizations. Baidu's internal clusters run Kunlun P800 at 10k+ card scale. Figures here are from public Baidu materials; official spec sheet not fully public."
481
+ notes_zh: "百度昆仑芯 P800 — 第三代,2024 年。96 GB HBM3(国产 AI 芯片中显存最大之一)。报告支持 8bit 推理和 MoE 优化。百度内部 1 万卡以上规模部署。数字来自百度公开资料,完整规格表未完全披露。"
482
+
483
+ - id: Kunlun-R200
484
+ aliases: [KunlunXin-R200, 昆仑芯R200, Kunlun-Gen2]
485
+ memory_gb: 32
486
+ nvlink_bandwidth_gbps: 200
487
+ memory_bandwidth_gbps: 512
488
+ fp16_tflops: 128
489
+ fp8_support: false
490
+ fp4_support: false
491
+ spec_source: "KunlunXin R200 (2nd gen, 2021). 7nm XPU architecture. FP16 128 TFLOPS / INT8 256 TOPS."
492
+ notes_en: "Baidu KunlunXin R200 — 2nd gen, 7nm. FP16 128 TFLOPS, INT8 256 TOPS. XPU architecture. PCIe 4.0 + XCCL interconnect. No FP8."
493
+ notes_zh: "百度昆仑芯 R200 — 第二代,7nm XPU 架构。FP16 128 TFLOPS,INT8 256 TOPS。PCIe 4.0 + 昆仑芯互联 XCCL。无 FP8。"
494
+
495
+ - id: BR100
496
+ aliases: [Biren-BR100, 壁仞BR100, 壁砺100]
497
+ memory_gb: 64
498
+ nvlink_bandwidth_gbps: 512
499
+ memory_bandwidth_gbps: 1640
500
+ fp16_tflops: 1024
501
+ fp8_support: false
502
+ fp4_support: false
503
+ spec_source: "Biren 壁仞 BR100 (OAM, 550W). 7nm Chiplet, 77B transistors. BF16/FP16 1024 TFLOPS, INT8 2048 TOPS, 64 GB HBM2e 1.64 TB/s. BLINK 512 GB/s 8-card fabric."
504
+ notes_en: "Biren BR100 (壁仞) — 2022 flagship. OAM format, 550W. 1024 TFLOPS BF16/FP16 (PFLOPS class), 64 GB HBM2e. BLINK interconnect 512 GB/s (8-card fabric). No FP8. US export-restricted since 2022 — production status uncertain."
505
+ notes_zh: "壁仞 BR100 — 2022 年旗舰 OAM 卡,550W。BF16/FP16 1024 TFLOPS(PFLOPS 级),64 GB HBM2e。BLINK 互联 512 GB/s(8 卡全互联)。无 FP8。2022 年被美国出口管制,后续量产状态不明。"
506
+
507
+ - id: BR104
508
+ aliases: [Biren-BR104, 壁仞BR104, 壁砺104]
509
+ memory_gb: 32
510
+ nvlink_bandwidth_gbps: 128
511
+ memory_bandwidth_gbps: 820
512
+ fp16_tflops: 512
513
+ fp8_support: false
514
+ fp4_support: false
515
+ spec_source: "Biren 壁仞 BR104 (PCIe, 300W). Single-die version of BR100 with halved specs. BF16/FP16 512 TFLOPS, 32 GB HBM2e. Won MLPerf Inference ResNet50 and BERT single-card top-1 in its class."
516
+ notes_en: "Biren BR104 — PCIe single-die version of BR100. 300W, 512 TFLOPS BF16/FP16, 32 GB HBM2e. Won MLPerf Inference BERT (1.58x A100 in server mode). No FP8. Export-restricted."
517
+ notes_zh: "壁仞 BR104 — BR100 的单 die PCIe 版本。300W,BF16/FP16 512 TFLOPS,32 GB HBM2e。MLPerf Inference BERT 测试 server 模式性能达 A100 的 1.58 倍。无 FP8。已被出口管制。"
518
+
519
+ - id: BI-V100
520
+ aliases: [Iluvatar-BI-V100, 天数天垓100, TianGai-100]
521
+ memory_gb: 32
522
+ nvlink_bandwidth_gbps: 64
523
+ memory_bandwidth_gbps: 1200
524
+ fp16_tflops: 147
525
+ fp8_support: false
526
+ fp4_support: false
527
+ spec_source: "Iluvatar CoreX 天数智芯 BI-V100 (天垓100). 7nm, SIMT, 24B transistors, 2.5D CoWoS packaging. FP16 147 TFLOPS / INT8 295 TOPS. 32 GB HBM2, 1.2 TB/s bandwidth. PCIe 4.0 x16, 250W TDP."
528
+ notes_en: "Iluvatar (天数智芯) BI-V100 — training/general-purpose. 7nm SIMT architecture, 32 GB HBM2, 1.2 TB/s memory bandwidth. FP16 147 TFLOPS, INT8 295 TOPS. 250W TDP. Interconnect bandwidth per card is modest (~64 GB/s shared)."
529
+ notes_zh: "天数智芯 BI-V100(天垓100)— 训练/通用 GPU。7nm SIMT 架构,32 GB HBM2,1.2 TB/s 显存带宽。FP16 147 TFLOPS,INT8 295 TOPS。250W TDP。单卡互联带宽 ~64 GB/s,相对较低。"
530
+
531
+ - id: MR-V100
532
+ aliases: [Iluvatar-MR-V100, 天数智铠100, ZhiKai-100]
533
+ memory_gb: 32
534
+ nvlink_bandwidth_gbps: 0
535
+ memory_bandwidth_gbps: 1200
536
+ fp16_tflops: 100
537
+ fp8_support: false
538
+ fp4_support: false
539
+ spec_source: "Iluvatar CoreX 天数智芯 智铠100 (MR-V100) 2022. Inference card, 32 GB HBM2E, ~200 TFLOPS BF16/FP16-low-precision-aggregated, 128-channel 1080p video decode, 150W TDP."
540
+ notes_en: "Iluvatar inference card (智铠100). 32 GB HBM2E. 150W TDP. Primarily inference-focused — mixed-precision aggregated throughput ~200 TFLOPS."
541
+ notes_zh: "天数智芯智铠100 推理卡。32 GB HBM2E,150W TDP。主要面向推理场景,混合精度聚合算力约 200 TFLOPS。"
542
+
543
+ - id: MLU370-X8
544
+ aliases: [Cambricon-MLU370-X8, 寒武纪MLU370-X8, 思元370-X8]
545
+ memory_gb: 48
546
+ nvlink_bandwidth_gbps: 200
547
+ memory_bandwidth_gbps: 614
548
+ fp16_tflops: 48
549
+ fp8_support: false
550
+ fp4_support: false
551
+ spec_source: "Cambricon 寒武纪 MLU370-X8 (dual MLU370 chiplet, 250W). 48 GB LPDDR5, INT8 256 TOPS, FP32 24 TFLOPS (FP16 ~48 TFLOPS estimated, official not given). MLU-Link 200 GB/s."
552
+ notes_en: "Cambricon (寒武纪) MLU370-X8 — dual-chip package, 250W. 48 GB LPDDR5 (not HBM), INT8 256 TOPS, FP32 24 TFLOPS. MLU-Link 200 GB/s for 8-card setups. LPDDR5 means lower memory bandwidth than HBM cards."
553
+ notes_zh: "寒武纪 MLU370-X8 — 双芯粒封装,250W。48 GB LPDDR5(非 HBM),INT8 256 TOPS,FP32 24 TFLOPS。MLU-Link 200 GB/s,支持 8 卡部署。LPDDR5 意味着显存带宽低于 HBM 卡。"
554
+
555
+ - id: MLU590
556
+ aliases: [Cambricon-MLU590, 寒武纪MLU590, 思元590]
557
+ memory_gb: 80
558
+ nvlink_bandwidth_gbps: 372
559
+ memory_bandwidth_gbps: 2000
560
+ fp16_tflops: 314
561
+ fp8_support: false
562
+ fp4_support: false
563
+ spec_source: "Cambricon 寒武纪 思元590 (MLU590) — 7nm, MLUv02/MLUarch05. 80 GB HBM (likely HBM2e based on 2 TB/s bandwidth), FP16 314 TFLOPS, FP32 80 TFLOPS, MLU-Link 372 GB/s. Used at Baidu ERNIE (文心一言) project."
564
+ notes_en: "Cambricon (寒武纪) MLU590 — flagship AI training chip. 80 GB HBM, 2 TB/s memory bandwidth. FP16 314 TFLOPS (dense). MLU-Link 372 GB/s 8-card fabric. Comparable FP16 compute to NVIDIA A100 level. No FP8. Production volume and ecosystem still maturing."
565
+ notes_zh: "寒武纪思元590 — 旗舰 AI 训练芯片。80 GB HBM,2 TB/s 显存带宽。FP16 314 TFLOPS(dense),综合性能约为 A100 级别。MLU-Link 372 GB/s 八卡互联。无 FP8。量产规模和生态仍在成熟。"
566
+
567
+ - id: Hygon-K100-AI
568
+ aliases: [K100-AI, 海光K100AI, DCU-K100-AI]
569
+ memory_gb: 64
570
+ nvlink_bandwidth_gbps: 184
571
+ memory_bandwidth_gbps: 896
572
+ fp16_tflops: 192
573
+ fp8_support: false
574
+ fp4_support: false
575
+ spec_source: "Hygon 海光 K100 AI — DCU architecture (GPGPU+AI hybrid), 64 GB HBM, 896 GB/s memory bandwidth, 350W TDP. FP16 192 TFLOPS dense (some sources cite 256 TFLOPS but values vary). xGMI 184 GB/s."
576
+ notes_en: "Hygon (海光) K100 AI — DCU series. 64 GB HBM, 896 GB/s bandwidth. FP16 192 TFLOPS (industry reports vary 100-256 TFLOPS depending on compute unit/mode). ROCm-compatible, can leverage AMD software ecosystem. Positioned against A800 for Chinese market. 350W TDP."
577
+ notes_zh: "海光 K100 AI — DCU 系列。64 GB HBM,896 GB/s 带宽。FP16 192 TFLOPS(公开资料数字因计算单元和精度模式不同有 100-256 TFLOPS 差异)。兼容 ROCm,可复用 AMD 软件生态。面向国产 A800 替代场景。350W TDP。"
578
+
579
+ - id: Hygon-Z100
580
+ aliases: [Z100, 海光Z100, DCU-Z100, 深算二号]
581
+ memory_gb: 32
582
+ nvlink_bandwidth_gbps: 184
583
+ memory_bandwidth_gbps: 1000
584
+ fp16_tflops: 180
585
+ fp8_support: false
586
+ fp4_support: false
587
+ spec_source: "Hygon 海光 DCU Z100 (深算二号) — 32 GB HBM2, 1 TB/s bandwidth, 8192 compute cores, FP32 90 TFLOPS, FP16 ~180 TFLOPS (2x FP32), FP64 10.8 TFLOPS. xGMI 184 GB/s. Performance reported as 80-90% of A100. 350W TDP."
588
+ notes_en: "Hygon (海光) DCU Z100 / 深算二号. 32 GB HBM2, 1 TB/s bandwidth, 8192 compute units. FP16 180 TFLOPS, FP32 90 TFLOPS, FP64 10.8 TFLOPS. 350W. Performance cited at 80-90% of A100. ROCm stack, PCIe Gen4 + xGMI multi-card."
589
+ notes_zh: "海光 DCU Z100(深算二号)。32 GB HBM2,1 TB/s 带宽,8192 计算单元。FP16 180 TFLOPS,FP32 90 TFLOPS,FP64 10.8 TFLOPS。350W。综合性能约为 A100 的 80-90%。基于 ROCm 栈,PCIe Gen4 + xGMI 多卡互联。"
590
+
591
+ - id: MTT-S4000
592
+ aliases: [MooreThreads-S4000, 摩尔线程S4000, MTT-S4000-48G]
593
+ memory_gb: 48
594
+ nvlink_bandwidth_gbps: 240
595
+ memory_bandwidth_gbps: 768
596
+ fp16_tflops: 100
597
+ fp8_support: false
598
+ fp4_support: false
599
+ spec_source: "Moore Threads MTT S4000 datasheet 2023 — 3rd-gen MUSA (曲院). 48 GB GDDR6, 768 GB/s bandwidth. FP16/BF16 100 TFLOPS, INT8 200 TOPS. MTLink 1.0 240 GB/s."
600
+ notes_en: "Moore Threads (摩尔线程) S4000 — domestic AI training card. 48 GB GDDR6 (not HBM), 768 GB/s. FP16/BF16 100 TFLOPS. MTLink 1.0 240 GB/s. CUDA compatibility via MUSA translation."
601
+ notes_zh: "摩尔线程 S4000 — 国产训推加速卡。48 GB GDDR6(非 HBM),768 GB/s 带宽。FP16/BF16 100 TFLOPS。MTLink 1.0 互联 240 GB/s。通过 MUSA 兼容 CUDA 生态。"
602
+
603
+ - id: MTT-S3000
604
+ aliases: [MooreThreads-S3000, 摩尔线程S3000]
605
+ memory_gb: 32
606
+ nvlink_bandwidth_gbps: 0
607
+ memory_bandwidth_gbps: 448
608
+ fp16_tflops: 30
609
+ fp8_support: false
610
+ fp4_support: false
611
+ spec_source: "Moore Threads MTT S3000 — MUSA 春晓 architecture. 32 GB GDDR6, 448 GB/s. FP32 ~15.2 TFLOPS inferred from S4000 comparison (S4000 is 64%+ higher); FP16 ~30 TFLOPS estimate (datasheet not fully public)."
612
+ notes_en: "Moore Threads (摩尔线程) S3000 — predecessor to S4000. 32 GB GDDR6, 448 GB/s. FP16 specs not fully published; estimated ~30 TFLOPS based on S4000 comparison. Multi-purpose server GPU, also supports rendering."
613
+ notes_zh: "摩尔线程 S3000 — S4000 的前代。32 GB GDDR6,448 GB/s。FP16 官方未完全披露,基于 S4000 对比推算约 30 TFLOPS。通用服务器 GPU,兼顾渲染场景。"
src/llm_cal/hardware/loader.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hardware database loader + lookup."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from functools import lru_cache
6
+ from importlib.resources import files
7
+ from pathlib import Path
8
+ from typing import Literal
9
+
10
+ from pydantic import BaseModel, Field
11
+
12
+ from llm_cal.common.yaml_loader import load_yaml
13
+
14
+
15
class GPUSpec(BaseModel):
    """Schema for a single accelerator record in the hardware database."""

    id: str
    aliases: list[str] = Field(default_factory=list)
    memory_gb: int
    nvlink_bandwidth_gbps: int
    # Device-memory (HBM/GDDR) bandwidth, which is a different number from
    # the NVLink figure above. Decode is memory-bandwidth-bound, so this
    # drives per-token latency:
    #   active_weight_bytes / (memory_bandwidth × utilization).
    # A value of 0 or None marks the bandwidth as unknown (the performance
    # module is expected to skip bandwidth checks in that case).
    memory_bandwidth_gbps: int | None = None
    fp16_tflops: float
    fp8_support: bool
    fp4_support: bool
    notes_en: str | None = None
    notes_zh: str | None = None
    # Provenance of the numbers above: a datasheet/benchmark URL or a short
    # citation such as "NVIDIA H100 datasheet 2024-Q3", kept so users can
    # audit where each figure came from.
    spec_source: str | None = None

    def localized_notes(self, locale: Literal["en", "zh"]) -> str | None:
        """Return the notes for *locale*, falling back to the other language.

        Returns None only when neither ``notes_en`` nor ``notes_zh`` is set.
        """
        if locale == "zh":
            preferred, fallback = self.notes_zh, self.notes_en
        else:
            preferred, fallback = self.notes_en, self.notes_zh
        return preferred or fallback
41
+
42
+
43
class GPUDatabase(BaseModel):
    """Top-level document of the GPU database: a schema version plus GPU entries."""

    schema_version: int
    gpus: list[GPUSpec]
46
+
47
+
48
class UnknownGPUError(Exception):
    """Raised when a requested GPU name matches no id or alias in the database."""
50
+
51
+
52
def _default_path() -> Path:
    """Return the path of the gpu_database.yaml bundled with ``llm_cal.hardware``."""
    package_root = files("llm_cal.hardware")
    resource = package_root.joinpath("gpu_database.yaml")
    return Path(str(resource))
55
+
56
+
57
@lru_cache(maxsize=1)
def load_database(path: Path | None = None) -> GPUDatabase:
    """Load and validate the GPU database, caching the most recent result.

    Args:
        path: YAML file to load. When omitted, the bundled database located
            by ``_default_path()`` is used.

    Returns:
        The parsed ``GPUDatabase``.
    """
    yaml_path = path if path else _default_path()
    return load_yaml(yaml_path, GPUDatabase)
60
+
61
+
62
def lookup(gpu: str, db: GPUDatabase | None = None) -> GPUSpec:
    """Look up a GPU by id or alias. Case-insensitive.

    Args:
        gpu: Name typed by the user; surrounding whitespace is ignored.
        db: Database to search. Defaults to the bundled database.

    Returns:
        The first entry whose id or one of whose aliases matches ``gpu``.

    Raises:
        UnknownGPUError: No entry matches. When the name is a *known* GPU
            followed by ``x<count>`` (the legacy ``H800x8`` syntax), the
            message points to the ``--gpu`` / ``--gpu-count`` flags.
    """
    database = db if db is not None else load_database()
    target = gpu.strip().upper()

    def _find(name: str) -> GPUSpec | None:
        """Return the first spec whose id or alias equals *name* (already upper-cased)."""
        for spec in database.gpus:
            if spec.id.upper() == name:
                return spec
            if any(alias.upper() == name for alias in spec.aliases):
                return spec
        return None

    match = _find(target)
    if match is not None:
        return match

    # Legacy "<gpu>x<count>" detection. Split on the LAST "X" so ids that
    # themselves contain an X (e.g. "MXC500x8") keep their full base name, and
    # only give the hint when the base really is a known GPU. Otherwise any
    # unknown name with digits after an X (e.g. "RTX5090") would be told to use
    # `--gpu RT --gpu-count 5090`.
    base, sep, count = target.rpartition("X")
    if sep and count.isdigit():
        base_spec = _find(base.strip())
        if base_spec is not None:
            raise UnknownGPUError(
                f"'{gpu}' looks like old 'H800x8' format. "
                f"Use `--gpu {base_spec.id} --gpu-count {count}` instead."
            )
    raise UnknownGPUError(f"Unknown GPU '{gpu}'. Known: {', '.join(s.id for s in database.gpus)}")
src/llm_cal/llm_review/__init__.py ADDED
File without changes
src/llm_cal/llm_review/reviewer.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Optional LLM-based second opinion on the tool's derivation trace.
2
+
3
+ Design constraints (from the tool's honesty principle):
4
+ 1. Never overrides the 6 primary labels. LLM responses are tagged
5
+ [llm-opinion] — a distinct 7th label.
6
+ 2. Opt-in only — requires --llm-review flag AND env vars set.
7
+ 3. Non-fatal — if the API call fails, the main report still works.
8
+ 4. User-chosen provider — supports any OpenAI-compatible endpoint
9
+ (OpenAI, DeepSeek, Moonshot, Zhipu, local vLLM, etc.)
10
+ 5. Deterministic input — the prompt is built from the --explain
11
+ derivation trace, not free-form. The LLM gets structured math,
12
+ not prose.
13
+ 6. The LLM's job is to CRITIQUE, not to REWRITE. The prompt
14
+ explicitly forbids generating new numbers.
15
+
16
+ Environment variables:
17
+ LLM_CAL_REVIEWER_API_KEY (required)
18
+ LLM_CAL_REVIEWER_BASE_URL (default: https://api.openai.com/v1)
19
+ LLM_CAL_REVIEWER_MODEL (default: gpt-4o)
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import os
25
+ from dataclasses import dataclass
26
+ from typing import Literal
27
+
28
+ import httpx
29
+
30
+ from llm_cal.core.explain import ExplainEntry
31
+
32
+ Locale = Literal["en", "zh"]
33
+
34
+
35
@dataclass(frozen=True)
class LLMReviewResult:
    """Immutable outcome of one LLM review attempt, successful or not."""

    # True when the reviewer returned usable content.
    ok: bool
    # Reviewer's reply text; None on failure.
    content: str | None
    # Human-readable failure description; None on success.
    error: str | None
    # Model name and endpoint the request was addressed to.
    model: str
    base_url: str
42
+
43
+
44
def run_review(
    entries: list[ExplainEntry],
    locale: Locale,
    timeout_s: float = 60.0,
) -> LLMReviewResult:
    """Send the derivation trace to an LLM for audit.

    Configuration is read from the ``LLM_CAL_REVIEWER_API_KEY``,
    ``LLM_CAL_REVIEWER_BASE_URL`` and ``LLM_CAL_REVIEWER_MODEL`` environment
    variables; the request goes to ``{base_url}/chat/completions``.

    Args:
        entries: Derivation trace entries to audit.
        locale: Language of the prompts ("en" or "zh").
        timeout_s: HTTP timeout in seconds.

    Returns:
        An ``LLMReviewResult``. Transport failures, non-200 responses and
        malformed payloads are reported through ``ok=False`` / ``error``
        rather than raised, so the main report is never broken by the review.
    """
    api_key = os.environ.get("LLM_CAL_REVIEWER_API_KEY")
    base_url = os.environ.get("LLM_CAL_REVIEWER_BASE_URL", "https://api.openai.com/v1").rstrip("/")
    model = os.environ.get("LLM_CAL_REVIEWER_MODEL", "gpt-4o")

    def _failure(message: str) -> LLMReviewResult:
        """Build the failed-result object for *message*."""
        return LLMReviewResult(
            ok=False,
            content=None,
            error=message,
            model=model,
            base_url=base_url,
        )

    if not api_key:
        return _failure(
            "LLM_CAL_REVIEWER_API_KEY env var not set. "
            "Set it to the API key of an OpenAI-compatible endpoint "
            "(OpenAI, DeepSeek, Moonshot, Zhipu, etc.)."
        )

    prompt = _build_prompt(entries, locale)

    try:
        with httpx.Client(timeout=timeout_s) as client:
            resp = client.post(
                f"{base_url}/chat/completions",
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": model,
                    "messages": [
                        {"role": "system", "content": _system_prompt(locale)},
                        {"role": "user", "content": prompt},
                    ],
                    "temperature": 0.1,
                    "max_tokens": 6000,
                },
            )
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        # httpx.HTTPError is the base of every transport failure (timeouts,
        # connect/read errors, unsupported protocol, ...). InvalidURL is a
        # separate class raised for a malformed base URL. Catching both keeps
        # the "never raises" contract for user-supplied endpoints.
        return _failure(f"{type(e).__name__}: {e}")

    if resp.status_code != 200:
        return _failure(f"HTTP {resp.status_code}: {resp.text[:500]}")

    try:
        data = resp.json()
        content = data["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # IndexError: empty "choices"; TypeError: a field is null or the
        # payload is not a JSON object; ValueError: body is not JSON.
        return _failure(f"Malformed response: {type(e).__name__}: {e}")

    if not isinstance(content, str):
        # A success result must carry text; a null/structured content field is
        # not something the report can display.
        return _failure("Malformed response: message content is missing or not a string")

    return LLMReviewResult(ok=True, content=content, error=None, model=model, base_url=base_url)
122
+
123
+
124
+ def _system_prompt(locale: Locale) -> str:
125
+ if locale == "zh":
126
+ return (
127
+ "你是一个大模型推理硬件计算工具的独立审计者。工具产出确定性的推导链,"
128
+ "你的工作是发现数学错误、不合理假设或遗漏。你不负责重新计算,"
129
+ "只负责评论和确认。输出简体中文。"
130
+ )
131
+ return (
132
+ "You are an independent auditor for a deterministic LLM inference hardware "
133
+ "calculator. The tool produces a derivation trace; your job is to find math "
134
+ "errors, unreasonable assumptions, or missing considerations. You do NOT "
135
+ "recalculate; you only critique and confirm."
136
+ )
137
+
138
+
139
def _build_prompt(entries: list[ExplainEntry], locale: Locale) -> str:
    """Render all entries into one trace and wrap it in the locale's user prompt.

    Entries are formatted individually and separated by a blank line.
    """
    rendered = [_format_entry(entry) for entry in entries]
    trace = "\n\n".join(rendered)
    wrap = _prompt_zh if locale == "zh" else _prompt_en
    return wrap(trace)
144
+
145
+
146
+ def _format_entry(entry: ExplainEntry) -> str:
147
+ parts: list[str] = [f"## {entry.heading}"]
148
+ parts.append(f"Formula:\n{entry.formula}")
149
+ if entry.inputs:
150
+ parts.append("Inputs:")
151
+ for inp in entry.inputs:
152
+ note = f" ({inp.note})" if inp.note else ""
153
+ parts.append(f" - {inp.name} = {inp.value} {inp.label}{note}")
154
+ if entry.steps:
155
+ parts.append("Steps:")
156
+ for step in entry.steps:
157
+ parts.append(f" {step}")
158
+ parts.append(f"Result: {entry.result}")
159
+ if entry.source:
160
+ parts.append(f"Source: {entry.source}")
161
+ return "\n".join(parts)
162
+
163
+
164
+ def _prompt_en(trace: str) -> str:
165
+ return f"""The deterministic tool produced this derivation trace for one model evaluation. \
166
+ Audit it.
167
+
168
+ <DERIVATION_TRACE>
169
+ {trace}
170
+ </DERIVATION_TRACE>
171
+
172
+ Respond in this structure. If a section has nothing to flag, write "none".
173
+
174
+ ## Critical issues
175
+ (math errors or wrong formulas — would give wrong final answer)
176
+
177
+ ## Moderate concerns
178
+ (unreasonable assumptions, factors off by 2x+, missing TP/sharding effects, etc.)
179
+
180
+ ## Minor notes
181
+ (clarifications, stylistic, optional improvements)
182
+
183
+ ## Consensus check
184
+ (which ExplainEntry headings look correct? name them explicitly)
185
+
186
+ Rules:
187
+ - Cite specific ExplainEntry heading names. Be concrete.
188
+ - Do NOT produce new numbers. Only critique.
189
+ - If you don't know, say so. Do not hallucinate.
190
+ - All your output must be tagged as a second opinion, NOT authoritative."""
191
+
192
+
193
+ def _prompt_zh(trace: str) -> str:
194
+ return f"""下面是工具产出的一份完整推导链。请审计。
195
+
196
+ <DERIVATION_TRACE>
197
+ {trace}
198
+ </DERIVATION_TRACE>
199
+
200
+ 按下面结构回复。没内容的段落写"无"。
201
+
202
+ ## 关键错误
203
+ (数学错误或公式错误 —— 会导致最终答案错)
204
+
205
+ ## 中度疑虑
206
+ (不合理假设、因子偏差 2x+、遗漏的 TP 分摊等)
207
+
208
+ ## 次要备注
209
+ (澄清、风格、可选改进)
210
+
211
+ ## 一致性核查
212
+ (哪些 ExplainEntry 标题看起来是对的?明确列出)
213
+
214
+ 规则:
215
+ - 必须引用具体的 ExplainEntry 标题名。具体点。
216
+ - 不要产出新数字,只做评论。
217
+ - 不确定的地方直说。不要编造。
218
+ - 你的所有输出都只是 second opinion,不是权威答案。"""
src/llm_cal/model_source/__init__.py ADDED
File without changes
src/llm_cal/model_source/auth.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Token discovery + user-friendly auth error messages."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+
7
+
8
def get_hf_token() -> str | None:
    """Return the HuggingFace token found in the environment, if any.

    A non-empty `HF_TOKEN` takes precedence; otherwise the value of
    `HUGGING_FACE_HUB_TOKEN` (possibly unset) is returned. This mirrors the
    huggingface-cli default.
    """
    primary = os.environ.get("HF_TOKEN")
    if primary:
        return primary
    return os.environ.get("HUGGING_FACE_HUB_TOKEN")
15
+
16
+
17
def get_modelscope_token() -> str | None:
    """Return the ModelScope token found in the environment, if any.

    A non-empty `MODELSCOPE_API_TOKEN` takes precedence; otherwise the value
    of `MODELSCOPE_TOKEN` (possibly unset) is returned.
    """
    primary = os.environ.get("MODELSCOPE_API_TOKEN")
    if primary:
        return primary
    return os.environ.get("MODELSCOPE_TOKEN")
19
+
20
+
21
def hf_auth_error_message(model_id: str) -> str:
    """Return the two-line English hint shown when a HuggingFace model needs a token."""
    problem = f"Model '{model_id}' requires authentication (gated or private).\n"
    remedy = "Set HF_TOKEN env var or run: huggingface-cli login"
    return problem + remedy
26
+
27
+
28
def modelscope_auth_error_message(model_id: str) -> str:
    """Return the two-line Chinese hint shown when a ModelScope model needs a token."""
    # The message targets Chinese-speaking users; its full-width punctuation
    # is deliberate and must be kept.
    problem = f"模型 '{model_id}' 需要登录(gated 或 私有)。\n"
    remedy = "设置 MODELSCOPE_API_TOKEN 环境变量,或执行:modelscope login"
    return problem + remedy
src/llm_cal/model_source/base.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ModelSource ABC — HF and ModelScope implement this."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from dataclasses import dataclass
7
+ from typing import Any
8
+
9
+
10
@dataclass(frozen=True)
class SiblingFile:
    """A single file listed in a model repository."""

    filename: str
    # Size in bytes; None when the source did not report one.
    size: int | None
16
+
17
+
18
@dataclass(frozen=True)
class ModelArtifact:
    """Uninterpreted data fetched for one model by a ModelSource.

    This is the thin "fetch" layer only: nothing is interpreted here.
    Interpretation belongs to `architecture/` and `weight_analyzer/`.
    """

    # Origin of the data: "huggingface" | "modelscope".
    source: str
    model_id: str
    # Provided by HF; used as a cache key component. May be None.
    commit_sha: str | None
    # Parsed config.json.
    config: dict[str, Any]
    # Every file in the repo.
    siblings: tuple[SiblingFile, ...]
31
+
32
+
33
class ModelNotFoundError(Exception):
    """Raised when the requested model id cannot be found on the source."""
35
+
36
+
37
class AuthRequiredError(Exception):
    """Raised when a gated or private model needs a token the user has not set."""
39
+
40
+
41
class SourceUnavailableError(Exception):
    """Raised for transient source failures: network errors, timeouts, rate limits, etc."""
43
+
44
+
45
class ModelSource(ABC):
    """Common interface implemented by every model hub backend (HF, ModelScope, ...)."""

    # Short identifier of the backend; each subclass sets its own value.
    name: str

    @abstractmethod
    def fetch(self, model_id: str) -> ModelArtifact:
        """Return config.json and the repo file list for *model_id*.

        Raises:
            ModelNotFoundError: the source answered 404.
            AuthRequiredError: the source answered 401/403 (gated/private).
            SourceUnavailableError: 429, 5xx, timeout, or network failure.
        """
src/llm_cal/model_source/huggingface.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """HuggingFace source. Uses `huggingface_hub` for metadata + `httpx` for config fetch.
2
+
3
+ Anti-pattern warning: do NOT call `list_repo_files()` then head-request each file.
4
+ Always use `model_info(files_metadata=True)` which returns all sibling sizes in
5
+ ONE request. Verified in `tests/test_hf.py` by asserting HTTP call count.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ from typing import Any
12
+
13
+ import httpx
14
+ from huggingface_hub import HfApi
15
+ from huggingface_hub.utils import (
16
+ GatedRepoError,
17
+ HfHubHTTPError,
18
+ RepositoryNotFoundError,
19
+ )
20
+
21
+ from llm_cal.model_source.auth import get_hf_token, hf_auth_error_message
22
+ from llm_cal.model_source.base import (
23
+ AuthRequiredError,
24
+ ModelArtifact,
25
+ ModelNotFoundError,
26
+ ModelSource,
27
+ SiblingFile,
28
+ SourceUnavailableError,
29
+ )
30
+
31
+ _CONFIG_URL = "https://huggingface.co/{model_id}/resolve/{revision}/config.json"
32
+
33
+
34
class HuggingFaceSource(ModelSource):
    """ModelSource backed by the HuggingFace Hub.

    Repo metadata comes from ``HfApi.model_info`` (one request for all sibling
    sizes); config.json is downloaded with ``httpx`` from the same endpoint.
    """

    name = "huggingface"

    _DEFAULT_ENDPOINT = "https://huggingface.co"
    _CONFIG_PATH = "/{model_id}/resolve/{revision}/config.json"

    def __init__(self, endpoint: str | None = None, timeout_s: float = 30.0) -> None:
        """Create the source.

        Args:
            endpoint: Hub base URL (e.g. a mirror). When omitted, the endpoint
                resolved by ``huggingface_hub`` is used.
            timeout_s: Timeout in seconds applied to both HTTP calls.
        """
        # huggingface_hub picks up HF_ENDPOINT env; we pass through for explicitness
        self._api = HfApi(endpoint=endpoint, token=get_hf_token())
        self._timeout_s = timeout_s
        # Reuse the endpoint HfApi resolved so that the config.json download
        # hits the same host as the metadata call.
        # NOTE(review): relies on HfApi exposing its resolved `endpoint`
        # attribute; the getattr keeps the previous default if it does not.
        resolved = endpoint or getattr(self._api, "endpoint", None) or self._DEFAULT_ENDPOINT
        self._endpoint = resolved.rstrip("/")

    def fetch(self, model_id: str) -> ModelArtifact:
        """Fetch the sibling file list, commit sha and config.json for *model_id*.

        Raises:
            ModelNotFoundError: repo or its config.json does not exist.
            AuthRequiredError: repo is gated/private and no valid token is set.
            SourceUnavailableError: rate limit, server error, timeout or
                network failure.
        """
        token = get_hf_token()

        # Step 1: siblings + commit sha in ONE request.
        # CRITICAL: files_metadata=True — see module docstring.
        try:
            info = self._api.model_info(
                repo_id=model_id,
                files_metadata=True,
                timeout=self._timeout_s,
                token=token,
            )
        except GatedRepoError as e:
            # Must come before RepositoryNotFoundError: GatedRepoError is a
            # subclass of it, so the reverse order reports gated repos as
            # "not found" and hides the authentication hint.
            raise AuthRequiredError(hf_auth_error_message(model_id)) from e
        except RepositoryNotFoundError as e:
            raise ModelNotFoundError(f"Model '{model_id}' not found on HuggingFace.") from e
        except HfHubHTTPError as e:
            status = getattr(e.response, "status_code", None)
            if status in (401, 403):
                raise AuthRequiredError(hf_auth_error_message(model_id)) from e
            if status == 429:
                retry = e.response.headers.get("Retry-After", "unknown")
                raise SourceUnavailableError(
                    f"HuggingFace rate limit (429). Retry-After: {retry}s. "
                    "Setting HF_TOKEN increases your quota."
                ) from e
            raise SourceUnavailableError(f"HuggingFace error ({status}): {e}") from e
        except (httpx.TimeoutException, TimeoutError) as e:
            raise SourceUnavailableError(
                f"HuggingFace request timed out after {self._timeout_s}s."
            ) from e
        except (httpx.HTTPError, OSError) as e:
            # Remaining transport failures (connection refused, DNS, reset...).
            # OSError also covers `requests` exceptions raised by
            # huggingface_hub releases that are built on requests.
            raise SourceUnavailableError(f"HuggingFace request failed: {e}") from e

        siblings = tuple(
            SiblingFile(filename=s.rfilename, size=s.size) for s in (info.siblings or [])
        )
        commit_sha = info.sha

        # Step 2: fetch config.json. If commit sha is available, pin to it so we don't
        # race with repo updates between the two calls.
        config = self._fetch_config(model_id, commit_sha or "main", token)

        return ModelArtifact(
            source=self.name,
            model_id=model_id,
            commit_sha=commit_sha,
            config=config,
            siblings=siblings,
        )

    def _fetch_config(self, model_id: str, revision: str, token: str | None) -> dict[str, Any]:
        """Download and parse config.json at *revision* from the configured endpoint.

        Raises:
            ModelNotFoundError: config.json is missing (HTTP 404).
            AuthRequiredError: HTTP 401/403.
            SourceUnavailableError: rate limit, other HTTP errors, transport
                failure, or a body that is not a JSON object.
        """
        url = self._endpoint + self._CONFIG_PATH.format(model_id=model_id, revision=revision)
        headers = {"Authorization": f"Bearer {token}"} if token else {}
        try:
            resp = httpx.get(url, headers=headers, timeout=self._timeout_s, follow_redirects=True)
        except httpx.HTTPError as e:
            # Base class of all httpx transport errors, including the
            # timeout/connect errors handled previously.
            raise SourceUnavailableError(f"config.json fetch failed: {e}") from e

        if resp.status_code == 404:
            raise ModelNotFoundError(
                f"Model '{model_id}' exists but has no config.json. "
                "May be a GGUF-only or dataset repo (not supported in v0.1)."
            )
        if resp.status_code in (401, 403):
            raise AuthRequiredError(hf_auth_error_message(model_id))
        if resp.status_code == 429:
            retry = resp.headers.get("Retry-After", "unknown")
            raise SourceUnavailableError(f"HuggingFace rate limit (429). Retry-After: {retry}s.")
        if resp.status_code >= 400:
            raise SourceUnavailableError(f"config.json fetch returned HTTP {resp.status_code}")

        try:
            parsed = json.loads(resp.text)
        except json.JSONDecodeError as e:
            raise SourceUnavailableError(
                f"config.json is not valid JSON (line {e.lineno} col {e.colno}): {e.msg}"
            ) from e
        if not isinstance(parsed, dict):
            # Callers index the config by key; a top-level list/scalar would
            # only fail later with a confusing error.
            raise SourceUnavailableError(
                f"config.json is not a JSON object (got {type(parsed).__name__})."
            )
        return parsed
src/llm_cal/model_source/modelscope.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ModelScope source — REST-only via httpx.
2
+
3
+ Decision: Option B from ADR-001. We don't need the official `modelscope` SDK
4
+ because llm-cal only requires three things:
5
+ 1. List repo files + sizes (one API call)
6
+ 2. Fetch config.json (one API call)
7
+ 3. Range-GET a safetensors header (handled by safetensors_reader)
8
+
9
+ The SDK pulls heavy ML deps by default (torch / tf for some install paths).
10
+ REST keeps the install footprint flat, mirrors the existing httpx hot path,
11
+ and gives us identical exception semantics across HF + MS.
12
+
13
+ Endpoints (verified against modelscope.cn public docs, 2026-04):
14
+ * GET /api/v1/models/{owner}/{name} — model meta
15
+ * GET /api/v1/models/{owner}/{name}/repo/files?Recursive=true
16
+ — file tree + sizes
17
+ * GET /api/v1/models/{owner}/{name}/repo?FilePath=...&Revision=...
18
+ — raw file content
19
+
20
+ ModelScope wraps every response in a {Code, Message, Data, Success} envelope.
21
+ Field casing is PascalCase. We parse defensively — fields may evolve.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import json
27
+ from typing import Any
28
+
29
+ import httpx
30
+
31
+ from llm_cal.model_source.auth import (
32
+ get_modelscope_token,
33
+ modelscope_auth_error_message,
34
+ )
35
+ from llm_cal.model_source.base import (
36
+ AuthRequiredError,
37
+ ModelArtifact,
38
+ ModelNotFoundError,
39
+ ModelSource,
40
+ SiblingFile,
41
+ SourceUnavailableError,
42
+ )
43
+
44
+ DEFAULT_ENDPOINT = "https://www.modelscope.cn"
45
+ DEFAULT_REVISION = "master"
46
+
47
+ _INFO_PATH = "/api/v1/models/{model_id}"
48
+ _FILES_PATH = "/api/v1/models/{model_id}/repo/files"
49
+ _RAW_PATH = "/api/v1/models/{model_id}/repo"
50
+
51
+
52
class ModelScopeSource(ModelSource):
    """Model source that talks to the ModelScope REST API directly (no SDK).

    ``fetch`` issues up to three GET requests: model info (best-effort commit
    pin), the recursive file listing, and the raw ``config.json``.
    """

    name = "modelscope"

    def __init__(
        self,
        endpoint: str | None = None,
        timeout_s: float = 30.0,
        revision: str = DEFAULT_REVISION,
    ) -> None:
        """Store connection settings; no network traffic happens here.

        Args:
            endpoint: Base URL of the ModelScope site. Falls back to
                ``DEFAULT_ENDPOINT`` when falsy; a trailing slash is stripped.
            timeout_s: Timeout in seconds passed to every HTTP request.
            revision: Revision used when no commit sha could be resolved.
        """
        self._endpoint = (endpoint or DEFAULT_ENDPOINT).rstrip("/")
        self._timeout_s = timeout_s
        self._revision = revision

    def fetch(self, model_id: str) -> ModelArtifact:
        """Collect commit sha, file list and parsed config for ``model_id``.

        Raises:
            ModelNotFoundError: The file list or config request returned 404.
            AuthRequiredError: The file list or config request returned 401/403.
            SourceUnavailableError: Transport failure, rate limiting, any other
                non-200 status, or a payload that could not be parsed.
        """
        token = get_modelscope_token()
        headers = self._auth_headers(token)

        # Step 1: model info — gives us a commit pin when available. A missing
        # pin is tolerated; the configured revision is used instead so that
        # the file list + config calls still work.
        commit_sha = self._fetch_commit_sha(model_id, headers)
        revision = commit_sha or self._revision

        # Step 2: file tree with sizes. ONE call, recursive, includes sub-folders.
        siblings = self._list_files(model_id, revision, headers)

        # Step 3: config.json. Pinned to the commit sha when we have it so two
        # back-to-back calls don't race against a repo update.
        config = self._fetch_config(model_id, revision, headers)

        return ModelArtifact(
            source=self.name,
            model_id=model_id,
            commit_sha=commit_sha,
            config=config,
            siblings=siblings,
        )

    # ------------------------------------------------------------------ helpers

    def _auth_headers(self, token: str | None) -> dict[str, str]:
        """Return a Bearer ``Authorization`` header, or no headers without a token."""
        return {"Authorization": f"Bearer {token}"} if token else {}

    def _fetch_commit_sha(self, model_id: str, headers: dict[str, str]) -> str | None:
        """Best-effort lookup of the model's commit sha.

        Never raises for network, status or parsing problems; returns ``None``
        so the caller can fall back to the configured revision.
        """
        url = f"{self._endpoint}{_INFO_PATH.format(model_id=model_id)}"
        try:
            resp = httpx.get(
                url, headers=headers, timeout=self._timeout_s, follow_redirects=True
            )
        except httpx.HTTPError:
            # httpx.HTTPError is the base class of the timeout and connect
            # errors, so a single clause covers every transport failure.
            return None

        if resp.status_code != 200:
            return None
        try:
            payload = resp.json()
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError both derive from
            # ValueError; either means the body is unusable.
            return None

        data = payload.get("Data") if isinstance(payload, dict) else None
        if not isinstance(data, dict):
            return None
        # Field name has bounced between LatestSha / latest_sha / Revision in
        # historical docs; check several.
        for key in ("LatestSha", "latest_sha", "Revision", "Sha"):
            v = data.get(key)
            if isinstance(v, str) and v:
                return v
        return None

    def _list_files(
        self, model_id: str, revision: str, headers: dict[str, str]
    ) -> tuple[SiblingFile, ...]:
        """Return every non-directory entry of the repo at ``revision``.

        Raises:
            SourceUnavailableError: Transport failure, non-JSON body, or an
                envelope shape ``_extract_files`` does not recognise.
            ModelNotFoundError / AuthRequiredError: See ``_raise_for_status``.
        """
        url = f"{self._endpoint}{_FILES_PATH.format(model_id=model_id)}"
        params = {"Recursive": "true", "Revision": revision}
        try:
            resp = httpx.get(
                url,
                headers=headers,
                params=params,
                timeout=self._timeout_s,
                follow_redirects=True,
            )
        except httpx.HTTPError as e:
            # Catch the httpx base class so read errors, protocol errors and
            # redirect loops are wrapped too, not just timeouts/connect errors.
            raise SourceUnavailableError(f"ModelScope file list failed: {e}") from e

        self._raise_for_status(resp, model_id, what="file list")

        try:
            payload = resp.json()
        except ValueError as e:
            raise SourceUnavailableError(
                f"ModelScope file list returned non-JSON: {e}"
            ) from e

        files = _extract_files(payload)
        if files is None:
            raise SourceUnavailableError(
                "ModelScope file list payload had unexpected shape — "
                "neither Data.Files nor Data is a list."
            )
        return tuple(
            SiblingFile(filename=f["Path"], size=f.get("Size"))
            for f in files
            if isinstance(f, dict) and isinstance(f.get("Path"), str)
            # Only include blobs (not directories). Type=tree means folder.
            and f.get("Type", "blob") != "tree"
        )

    def _fetch_config(
        self, model_id: str, revision: str, headers: dict[str, str]
    ) -> dict[str, Any]:
        """Download ``config.json`` at ``revision`` and parse it into a dict.

        Raises:
            SourceUnavailableError: Transport failure, invalid JSON, or JSON
                whose top level is not an object.
            ModelNotFoundError / AuthRequiredError: See ``_raise_for_status``.
        """
        url = f"{self._endpoint}{_RAW_PATH.format(model_id=model_id)}"
        params = {"FilePath": "config.json", "Revision": revision}
        try:
            resp = httpx.get(
                url,
                headers=headers,
                params=params,
                timeout=self._timeout_s,
                follow_redirects=True,
            )
        except httpx.HTTPError as e:
            raise SourceUnavailableError(f"config.json fetch failed: {e}") from e

        self._raise_for_status(resp, model_id, what="config.json")

        try:
            parsed: Any = json.loads(resp.text)
        except json.JSONDecodeError as e:
            raise SourceUnavailableError(
                f"config.json is not valid JSON (line {e.lineno} col {e.colno}): {e.msg}"
            ) from e
        if not isinstance(parsed, dict):
            raise SourceUnavailableError(
                "config.json did not parse to a JSON object."
            )
        return parsed

    def _raise_for_status(
        self, resp: httpx.Response, model_id: str, what: str
    ) -> None:
        """Translate a non-200 response into the matching source exception.

        Args:
            resp: The response to inspect.
            model_id: Used in the not-found and auth messages.
            what: Short description of the request, embedded in the message.
        """
        if resp.status_code == 200:
            return
        if resp.status_code == 404:
            raise ModelNotFoundError(
                f"Model '{model_id}' not found on ModelScope ({what})."
            )
        if resp.status_code in (401, 403):
            raise AuthRequiredError(modelscope_auth_error_message(model_id))
        if resp.status_code == 429:
            retry_after = (resp.headers.get("Retry-After") or "").strip()
            if not retry_after:
                retry = "unknown"
            elif retry_after.isdigit():
                # Only a delay in seconds gets the unit suffix; an HTTP-date
                # value is echoed untouched.
                retry = f"{retry_after}s"
            else:
                retry = retry_after
            raise SourceUnavailableError(
                f"ModelScope rate limit (429). Retry-After: {retry}. "
                "Setting MODELSCOPE_API_TOKEN increases your quota."
            )
        raise SourceUnavailableError(
            f"ModelScope {what} returned HTTP {resp.status_code}"
        )
211
+
212
+
213
+ def _extract_files(payload: Any) -> list[Any] | None:
214
+ """Pull the file list out of the wrapped ModelScope envelope.
215
+
216
+ Tolerates two known shapes:
217
+ A) {Data: {Files: [...]}} — most common
218
+ B) {Data: [...]} — older / list-only endpoints
219
+ """
220
+ if not isinstance(payload, dict):
221
+ return None
222
+ data = payload.get("Data")
223
+ if isinstance(data, dict):
224
+ files = data.get("Files")
225
+ if isinstance(files, list):
226
+ return files
227
+ if isinstance(data, list):
228
+ return data
229
+ return None
src/llm_cal/output/__init__.py ADDED
File without changes
src/llm_cal/output/formatter.py ADDED
@@ -0,0 +1,665 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Rich-formatted, fully i18n'd output for EvaluationReport.
2
+
3
+ Every visible string flows through `common.i18n.t()`. To add another locale,
4
+ add entries to `_MESSAGES` in i18n.py; no changes here needed.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Any
10
+
11
+ from rich.console import Console
12
+ from rich.panel import Panel
13
+ from rich.table import Table
14
+ from rich.text import Text
15
+
16
+ from llm_cal.common.i18n import get_locale, t
17
+ from llm_cal.core.evaluator import EvaluationReport
18
+ from llm_cal.engine_compat.loader import EngineCompatEntry, EngineFlag, EngineSource
19
+ from llm_cal.fleet.planner import FleetRecommendation
20
+ from llm_cal.hardware.loader import GPUDatabase
21
+ from llm_cal.output.labels import AnnotatedValue, Label
22
+
23
+ _LABEL_STYLES: dict[Label, str] = {
24
+ Label.VERIFIED: "bold green",
25
+ Label.INFERRED: "cyan",
26
+ Label.ESTIMATED: "yellow",
27
+ Label.CITED: "blue",
28
+ Label.UNVERIFIED: "bold yellow",
29
+ Label.UNKNOWN: "dim red",
30
+ Label.LLM_OPINION: "magenta",
31
+ }
32
+
33
+
34
def format_tag(av: AnnotatedValue[Any]) -> Text:
    """Build the coloured ``[label]`` tag for an annotated value.

    The label text is looked up through ``t()`` under ``label.<value>``; the
    colour comes from ``_LABEL_STYLES`` with ``"white"`` for unmapped labels.
    """
    localized = t(f"label.{av.label.value}")
    return Text(f"[{localized}]", style=_LABEL_STYLES.get(av.label, "white"))
38
+
39
+
40
+ def _fmt_bytes(n: int) -> str:
41
+ if n >= 1_000_000_000:
42
+ return f"{n / 1_000_000_000:.2f} GB"
43
+ if n >= 1_000_000:
44
+ return f"{n / 1_000_000:.2f} MB"
45
+ if n >= 1_000:
46
+ return f"{n / 1_000:.2f} KB"
47
+ return f"{n} B"
48
+
49
+
50
+ def _fmt_params(n: int) -> str:
51
+ if n >= 1_000_000_000:
52
+ return f"{n / 1_000_000_000:.2f}B"
53
+ if n >= 1_000_000:
54
+ return f"{n / 1_000_000:.2f}M"
55
+ return str(n)
56
+
57
+
58
def render(report: EvaluationReport, console: Console | None = None) -> None:
    """Print the full evaluation report, section by section.

    Args:
        report: The evaluation result to display.
        console: Target console; a fresh ``Console()`` is created when omitted.
    """
    out = console or Console()

    out.print()
    short_sha = f" @ {report.commit_sha[:7]}" if report.commit_sha else ""
    header = (
        f"[bold cyan]{report.model_id}[/bold cyan] "
        f"[dim]{t('panel.via')} {report.source}{short_sha}[/dim]"
    )
    out.print(Panel.fit(header, border_style="cyan"))

    # Sections are printed in this fixed order; each renderer decides for
    # itself whether it has anything to show.
    for section in (
        _render_architecture,
        _render_weight,
        _render_kv_cache,
        _render_engine_compat,
        _render_hardware,
        _render_fleet,
        _render_performance,
        _render_command,
    ):
        section(report, out)
    _render_label_legend(out)
80
+
81
+
82
def _render_architecture(report: EvaluationReport, console: Console) -> None:
    """Print the architecture table, followed by any profile warnings.

    Optional rows (attention, compress ratios, MoE, sliding window, max
    position) appear only when the profile carries the corresponding data.
    """
    profile = report.profile
    table = Table(title=t("section.architecture"), show_header=False, box=None, padding=(0, 2))
    table.add_column("field", style="dim")
    table.add_column("value")
    table.add_column("label")

    def add_verified(key: str, value: str) -> None:
        # Nearly every row in this section carries the "verified" tag.
        table.add_row(t(key), value, _verified_tag())

    add_verified("arch.model_type", profile.model_type or t("arch.none"))
    add_verified("arch.family", profile.family.value)
    table.add_row(
        t("arch.confidence"),
        profile.confidence.value,
        Text(f"[{profile.confidence.value}]", style="magenta"),
    )
    add_verified("arch.layers", str(profile.num_hidden_layers))
    add_verified("arch.hidden_size", str(profile.hidden_size))
    add_verified("arch.vocab_size", f"{profile.vocab_size:,}")

    attn = profile.attention
    if attn is not None:
        add_verified(
            "arch.attention",
            t(
                "arch.attn_summary",
                variant=attn.variant,
                heads=attn.num_heads,
                kv_heads=attn.num_kv_heads,
                head_dim=attn.head_dim,
            ),
        )
        ratios = attn.compress_ratios
        if ratios:
            add_verified(
                "arch.compress_ratios",
                t(
                    "arch.compress_ratios_summary",
                    n=len(ratios),
                    # Entries equal to 0 are reported as the "dense" count.
                    dense=sum(r == 0 for r in ratios),
                ),
            )

    moe = profile.moe
    if moe is not None:
        add_verified(
            "arch.moe",
            t(
                "arch.moe_summary",
                routed=moe.num_routed_experts,
                shared=moe.num_shared_experts,
                topk=moe.num_experts_per_tok,
            ),
        )
    if profile.sliding_window:
        add_verified("arch.sliding_window", str(profile.sliding_window))
    if profile.position and profile.position.max_position_embeddings:
        add_verified("arch.max_position", f"{profile.position.max_position_embeddings:,}")

    console.print(table)
    if profile.auxiliary.get("warning"):
        console.print(f"[red]⚠ {profile.auxiliary['warning']}[/red]")
    if profile.auxiliary.get("v0_1_unsupported"):
        console.print(f"[yellow]⚠ {t('arch.unsupported_state_space')}[/yellow]")
146
+
147
+
148
def _render_weight(report: EvaluationReport, console: Console) -> None:
    """Print the weights table and, when candidates exist, the reconciliation table.

    The reconciliation table lists at most the first six candidate schemes,
    followed by a line naming the best match together with its label tag.
    """
    table = Table(title=t("section.weights"), show_header=False, box=None, padding=(0, 2))
    table.add_column("field", style="dim")
    table.add_column("value")
    table.add_column("label")

    w = report.weight
    table.add_row(
        t("weights.safetensors_bytes"),
        _fmt_bytes(w.total_bytes.value),
        format_tag(w.total_bytes),
    )
    table.add_row(
        t("weights.params_estimated"),
        _fmt_params(report.total_params_estimate.value),
        format_tag(report.total_params_estimate),
    )
    if w.bits_per_param is not None:
        table.add_row(
            t("weights.bits_per_param"),
            f"{w.bits_per_param.value:.2f}",
            format_tag(w.bits_per_param),
        )
    table.add_row(
        t("weights.quant_guess"),
        str(w.quantization_guess.value),
        format_tag(w.quantization_guess),
    )
    console.print(table)

    r = report.reconciliation
    if r.candidates:
        rec_table = Table(
            title=t("section.reconciliation"),
            title_justify="left",
            show_header=True,
            header_style="dim",
            box=None,
            padding=(0, 2),
        )
        rec_table.add_column(t("recon.scheme"))
        rec_table.add_column(t("recon.predicted"), justify="right")
        rec_table.add_column(t("recon.delta"), justify="right")
        rec_table.add_column(t("recon.error_pct"), justify="right")
        for c in r.candidates[:6]:
            direction = t("recon.over") if c.delta_bytes > 0 else t("recon.under")
            rec_table.add_row(
                c.scheme,
                _fmt_bytes(c.predicted_bytes),
                f"{_fmt_bytes(abs(c.delta_bytes))} {direction}",
                f"{c.relative_error * 100:.1f}%",
            )
        console.print(rec_table)
        # Pass the tag as its own Text renderable. Interpolating it into the
        # markup string would make Rich parse "[estimated]" as a style tag
        # and drop both the text and its colour.
        console.print(
            f"[bold]{t('recon.best')}[/bold] {r.best.value}",
            format_tag(r.best),
        )
202
+
203
+
204
def _render_kv_cache(report: EvaluationReport, console: Console) -> None:
    """Print one row per context length with its KV-cache size and label tag.

    Nothing is printed when the report has no KV-cache entries.
    """
    by_context = report.kv_cache_by_context
    if not by_context:
        return

    table = Table(
        title=t("section.kv_cache"),
        title_justify="left",
        show_header=True,
        header_style="dim",
        box=None,
        padding=(0, 2),
    )
    table.add_column(t("kv.context"))
    table.add_column(t("kv.kv_cache"), justify="right")
    table.add_column(t("kv.label"))

    unit = t("kv.tokens")  # looked up once, reused for every row
    for ctx_len, estimate in by_context.items():
        table.add_row(f"{ctx_len:,} {unit}", _fmt_bytes(estimate.value), format_tag(estimate))
    console.print(table)
226
+
227
+
228
def _render_engine_compat(report: EvaluationReport, console: Console) -> None:
    """Print the engine-compatibility table, or a "no match" line.

    Multi-line cells (flags, caveats, sources) are wrapped in ``Text`` so Rich
    shows them literally. As plain strings they would be parsed as console
    markup, and bracketed fragments such as the ``[label]`` prefix produced by
    ``_fmt_source`` could be silently consumed as style tags.
    """
    m = report.engine_match
    if m is None:
        console.print()
        console.print(
            f"[dim]{t('section.engine_compat')}:[/dim] [yellow]{t('engine.no_match')}[/yellow]"
        )
        return

    table = Table(
        title=f"{t('section.engine_compat')} — {m.engine}",
        show_header=False,
        box=None,
        padding=(0, 2),
    )
    table.add_column("field", style="dim")
    table.add_column("value")
    table.add_column("label")

    verif_label = _verif_label(m)
    table.add_row(t("engine.version_spec"), m.version_spec, Text(""))
    table.add_row(t("engine.support"), m.support, verif_label)
    table.add_row(t("engine.verification"), m.verification_level, verif_label)

    if m.required_flags:
        lines = [_fmt_flag(f) for f in m.required_flags]
        table.add_row(t("engine.required_flags"), Text("\n".join(lines)), Text(""))
    if m.optional_flags:
        lines = [_fmt_flag(f) for f in m.optional_flags]
        table.add_row(t("engine.optional_flags"), Text("\n".join(lines)), Text(""))

    # Caveats come in two languages; pick the one matching the active locale.
    caveats = m.caveats_zh if get_locale() == "zh" else m.caveats_en
    if caveats:
        table.add_row(
            t("engine.caveats"),
            Text("\n".join(f"• {c}" for c in caveats)),
            Text(""),
        )

    if m.sources:
        source_lines = [_fmt_source(s) for s in m.sources]
        table.add_row(t("engine.sources"), Text("\n".join(source_lines)), Text(""))

    console.print(table)
268
+
269
+
270
def _render_hardware(report: EvaluationReport, console: Console) -> None:
    """Print the spec table for the selected GPU, or an error line if unknown.

    When ``report.gpu_spec`` is ``None`` the section shows ``report.gpu_error``
    (or a generic "Unknown GPU" message) and returns.
    """
    console.print()
    if report.gpu_spec is None:
        msg = report.gpu_error or f"Unknown GPU '{report.gpu}'"
        console.print(f"[bold red]{t('section.hardware')}:[/bold red] [red]{msg}[/red]")
        return

    spec = report.gpu_spec
    locale = get_locale()
    table = Table(
        title=f"{t('section.hardware')} — {spec.id}",
        show_header=False,
        box=None,
        padding=(0, 2),
    )
    table.add_column("field", style="dim")
    table.add_column("value")

    table.add_row(t("hw.memory"), f"{spec.memory_gb} GB HBM")
    # Same convention as render_gpu_list: a missing/zero NVLink bandwidth is
    # shown as a dash instead of "None GB/s" or "0 GB/s".
    nvlink = f"{spec.nvlink_bandwidth_gbps} GB/s" if spec.nvlink_bandwidth_gbps else "—"
    table.add_row(t("hw.nvlink_bandwidth"), nvlink)
    table.add_row(t("hw.fp16_tflops"), f"{spec.fp16_tflops:.0f} TFLOPS")
    table.add_row(t("hw.fp8_support"), t("hw.bool_yes") if spec.fp8_support else t("hw.bool_no"))
    table.add_row(t("hw.fp4_support"), t("hw.bool_yes") if spec.fp4_support else t("hw.bool_no"))
    notes = spec.localized_notes(locale)
    if notes:
        table.add_row(t("hw.notes"), notes)
    if spec.spec_source:
        table.add_row(t("hw.spec_source"), spec.spec_source)
    console.print(table)
299
+
300
+
301
def _render_fleet(report: EvaluationReport, console: Console) -> None:
    """Print the fleet-sizing table: one row per tier option.

    Columns are tier, GPU count, weight per GPU, headroom per GPU, plus one
    concurrency column per context length chosen by
    ``_select_concurrency_columns``. The best tier is marked with a star and
    options that do not fit are dimmed red.
    """
    fleet = report.fleet
    if fleet is None:
        # With no GPU spec the hardware section already surfaced the error,
        # so only the "spec known but no fleet" case gets a message here.
        if report.gpu_spec is not None:
            console.print(f"[dim]{t('fleet.gpu_spec_unknown')}[/dim]")
        return

    # Decide which context lengths to surface as concurrency columns.
    ctx_cols = _select_concurrency_columns(fleet)

    gpu_name = report.gpu_spec.id if report.gpu_spec else report.gpu
    table = Table(
        title=f"{t('section.fleet')} — {gpu_name}",
        title_justify="left",
        show_header=True,
        header_style="dim",
        box=None,
        padding=(0, 2),
    )
    table.add_column(t("fleet.col.tier"))
    for key in ("fleet.col.gpus", "fleet.col.weight_per_gpu", "fleet.col.headroom_per_gpu"):
        table.add_column(t(key), justify="right")
    for ctx in ctx_cols:
        table.add_column(
            t("fleet.col.concurrent_at_ctx", ctx=_fmt_ctx(ctx)),
            justify="right",
        )

    for opt in fleet.options:
        spare = opt.usable_bytes_per_gpu - opt.weight_bytes_per_gpu
        per_ctx = dict(opt.max_concurrent_by_context)
        star = " ★" if opt.tier == fleet.best_tier else ""
        cells = [
            f"{t(f'fleet.tier.{opt.tier}')}{star}",
            str(opt.gpu_count),
            _fmt_bytes(opt.weight_bytes_per_gpu),
            _fmt_bytes(spare) if spare > 0 else "—",
        ]
        for ctx in ctx_cols:
            fits_n = per_ctx.get(ctx, 0)
            cells.append(f"~{fits_n}" if fits_n > 0 else "✗")
        table.add_row(*cells, style=None if opt.fits else "dim red")

    console.print(table)

    note = fleet.constraint_note_zh if get_locale() == "zh" else fleet.constraint_note_en
    console.print(f"[dim]{t('fleet.constraint')} {note}[/dim]")
    console.print(f"[dim]★ {t('fleet.best_marker')}[/dim]")
353
+
354
+
355
+ def _select_concurrency_columns(f: FleetRecommendation) -> list[int]:
356
+ """Pick which context lengths become concurrency columns in the fleet table.
357
+
358
+ Rule: always include 128K if the model supports it; additionally include the
359
+ model's max context if it's larger than 128K. For shorter-context models,
360
+ fall back to 32K or whatever the max is.
361
+ """
362
+ all_ctxs: set[int] = set()
363
+ for opt in f.options:
364
+ for ctx, _ in opt.max_concurrent_by_context:
365
+ all_ctxs.add(ctx)
366
+ if not all_ctxs:
367
+ return []
368
+ picks: list[int] = []
369
+ if 131_072 in all_ctxs:
370
+ picks.append(131_072)
371
+ max_ctx = max(all_ctxs)
372
+ if max_ctx > 131_072 and max_ctx not in picks:
373
+ picks.append(max_ctx)
374
+ if not picks:
375
+ picks.append(32_768 if 32_768 in all_ctxs else max_ctx)
376
+ return picks
377
+
378
+
379
+ def _fmt_ctx(ctx_tokens: int) -> str:
380
+ if ctx_tokens >= 1_000_000:
381
+ if ctx_tokens % 1_000_000 == 0:
382
+ return f"{ctx_tokens // 1_000_000}M"
383
+ return f"{ctx_tokens / 1_000_000:.1f}M"
384
+ if ctx_tokens >= 1024:
385
+ return f"{ctx_tokens // 1024}K"
386
+ return str(ctx_tokens)
387
+
388
+
389
def _render_performance(report: EvaluationReport, console: Console) -> None:
    """Print the performance section: assumptions, metrics table, optimization tips.

    The whole section is skipped unless prefill, decode, concurrency, the
    input-token count and the target tokens/sec are all present on the report.
    """
    prefill, decode, conc = report.prefill, report.decode, report.concurrency
    prerequisites = (
        prefill,
        decode,
        conc,
        report.perf_input_tokens,
        report.perf_target_tokens_per_sec,
    )
    if any(item is None for item in prerequisites):
        return

    console.print()
    # Assumption banner — surfaces the utilization factors, SLA, and
    # degradation factor. Every number in the performance section depends
    # on these.
    assumptions = t(
        "perf.assumptions_note",
        input_tokens=report.perf_input_tokens,
        output_tokens=report.perf_output_tokens,
        target_tps=report.perf_target_tokens_per_sec,
        prefill_util=prefill.utilization,
        decode_util=decode.bw_utilization,
        degradation=conc.degradation_factor,
    )
    console.print(f"[dim italic]{assumptions}[/dim italic]")

    table = Table(
        title=t("section.performance"),
        title_justify="left",
        show_header=False,
        box=None,
        padding=(0, 2),
    )
    table.add_column("field", style="dim")
    table.add_column("value")
    table.add_column("label")

    # (translation key, displayed value, annotated value supplying the tag)
    metrics = [
        ("perf.prefill_latency", f"{prefill.latency_ms.value:.1f} ms", prefill.latency_ms),
        (
            "perf.decode_throughput_per_gpu",
            f"{decode.per_gpu_tokens_per_sec.value:.1f} tok/s",
            decode.per_gpu_tokens_per_sec,
        ),
        (
            "perf.decode_throughput_cluster",
            f"{decode.cluster_tokens_per_sec.value:.1f} tok/s",
            decode.cluster_tokens_per_sec,
        ),
    ]
    moe_tps = decode.moe_active_tokens_per_sec
    if moe_tps is not None:
        metrics.append(
            ("perf.decode_moe_active_optimistic", f"{moe_tps.value:.1f} tok/s", moe_tps)
        )
    metrics.extend(
        [
            ("perf.k_bound", str(conc.k_bound.value), conc.k_bound),
            ("perf.l_bound", str(conc.l_bound.value), conc.l_bound),
            ("perf.max_concurrent", str(conc.max_concurrent.value), conc.max_concurrent),
        ]
    )
    for key, shown, annotated in metrics:
        table.add_row(t(key), shown, format_tag(annotated))

    bottleneck_label = t(f"perf.bottleneck.{conc.bottleneck}")
    reason = conc.bottleneck_reason_zh if get_locale() == "zh" else conc.bottleneck_reason_en
    table.add_row(t("perf.bottleneck"), f"{bottleneck_label} — {reason}", Text(""))
    console.print(table)

    # Always show a short optimization list. Rules are currently static but
    # future versions can pick per bottleneck type.
    console.print(f"[bold]{t('perf.optimization.header')}:[/bold]")
    for tip_key in (
        "perf.opt.quantize_int4",
        "perf.opt.relax_sla",
        "perf.opt.kv_fp8",
        "perf.opt.moe_offload",
    ):
        console.print(f" • {t(tip_key)}")
485
+
486
+
487
def _render_command(report: EvaluationReport, console: Console) -> None:
    """Print the generated launch command in a panel titled with its tier.

    Skipped when the report has no generated command or no fleet. The tier in
    the title is the option matching ``fleet.best_tier``, falling back to the
    first option.
    """
    fleet = report.fleet
    if fleet is None or not report.generated_command:
        return

    # Figure out which tier we emitted the command for.
    chosen = fleet.options[0]
    for candidate in fleet.options:
        if candidate.tier == fleet.best_tier:
            chosen = candidate
            break

    header_note = t(
        "command.tier_note",
        tier=t(f"fleet.tier.{chosen.tier}"),
        gpus=chosen.gpu_count,
    )
    console.print()
    panel = Panel(
        report.generated_command,
        title=f"{t('section.command')} — {header_note}",
        title_align="left",
        border_style="green",
    )
    console.print(panel)
506
+
507
+
508
def _render_label_legend(console: Console) -> None:
    """Print a one-line legend listing every ``Label`` in its own colour."""
    parts: list[tuple[str, str]] = [(f"{t('section.labels')} ", "dim")]
    for label in Label:
        shown = t(f"label.{label.value}")
        parts.append((f"[{shown}] ", _LABEL_STYLES.get(label, "white")))
    console.print(Text.assemble(*parts))
515
+
516
+
517
def _verified_tag() -> Text:
    """Return the ``[verified]`` tag styled with the VERIFIED label colour."""
    caption = t("label.verified")
    return Text(f"[{caption}]", style=_LABEL_STYLES[Label.VERIFIED])
519
+
520
+
521
def render_llm_review(result: Any, console: Console | None = None) -> None:
    """Render the --llm-review block. Accepts an LLMReviewResult.

    Failure is non-fatal — when ``result.ok`` is false an "unavailable"
    message and a setup hint are shown and the function returns.

    Args:
        result: Object exposing ``ok``, ``error``, ``model``, ``base_url`` and
            ``content``.
        console: Target console; a fresh ``Console()`` is created when omitted.
    """
    console = console or Console()
    console.print()
    console.print(Panel.fit(t("section.llm_review"), border_style="magenta"))

    if not result.ok:
        msg = t("llm_review.unavailable", error=result.error or "unknown")
        console.print(f"[yellow]{msg}[/yellow]")
        console.print(f"[dim]{t('llm_review.setup_hint')}[/dim]")
        return

    # Disclaimer first — make it visually distinctive so users don't confuse
    # LLM opinion with the tool's own output.
    disclaimer = t("llm_review.disclaimer", model=result.model, base_url=result.base_url)
    console.print(f"[bold yellow]{disclaimer}[/bold yellow]")
    console.print()
    # The actual review, prefixed with the [llm-opinion] tag so users see
    # it's tagged too. The tag is built as a Text object: written inside a
    # markup string, "[llm-opinion]" would itself be parsed as a style tag
    # and vanish from the output.
    tag_display = t(f"label.{Label.LLM_OPINION.value}")
    console.print(Text(f"[{tag_display}]", style=_LABEL_STYLES[Label.LLM_OPINION]))
    # Print content verbatim: markup and emoji substitution are disabled so
    # bracketed text in the LLM output is neither swallowed nor able to raise
    # a markup error on an unmatched closing tag.
    console.print(result.content or "", markup=False, emoji=False)
548
+
549
+
550
def render_explain(entries: list[Any], console: Console | None = None) -> None:
    """Render the `--explain` block: a derivation trace for each number.

    `entries` is a list of `core.explain.ExplainEntry`. For every entry the
    heading, formula, inputs, steps, result, source and methodology anchor
    are printed in that order; inputs, steps, source and anchor are skipped
    when empty.
    """
    out = console or Console()

    def bold_heading(key: str) -> str:
        # Section captions share the "<bold>caption:</bold>" shape.
        return f"[bold]{t(key)}:[/bold]"

    out.print()
    out.print(Panel.fit(t("section.explain"), border_style="magenta"))
    out.print(f"[dim italic]{t('explain.intro')}[/dim italic]")
    out.print()

    for entry in entries:
        # Title bar per entry
        out.print(Panel.fit(f"[bold]{entry.heading}[/bold]", border_style="cyan"))

        # Formula, one console line per formula line
        out.print(bold_heading("explain.formula"))
        for formula_line in entry.formula.splitlines():
            out.print(f" [magenta]{formula_line}[/magenta]")

        # Inputs
        if entry.inputs:
            out.print(bold_heading("explain.inputs"))
            for item in entry.inputs:
                remark = f" [dim]({item.note})[/dim]" if item.note else ""
                out.print(
                    f" [cyan]{item.name}[/cyan] = {item.value} [dim]{item.label}[/dim]{remark}"
                )

        # Steps (a step may span several lines)
        if entry.steps:
            out.print(bold_heading("explain.steps"))
            for step in entry.steps:
                for step_line in step.splitlines():
                    out.print(f" {step_line}")

        # Result
        out.print(f"{bold_heading('explain.result')} {entry.result}")

        # Source + methodology anchor
        if entry.source:
            out.print(f"{bold_heading('explain.source')} {entry.source}")
        if entry.methodology_anchor:
            out.print(
                f"[dim]{t('explain.see_also')}: docs/methodology.md{entry.methodology_anchor}[/dim]"
            )
        out.print()
598
+
599
+
600
def render_gpu_list(db: GPUDatabase, console: Console | None = None) -> None:
    """Print the supported-GPU table. Invoked by `llm-cal --list-gpus`.

    One row per entry of ``db.gpus`` in iteration order, followed by a total
    count line.
    """
    out = console or Console()
    locale = get_locale()

    table = Table(
        title=t("gpus.list.title"),
        title_justify="left",
        show_header=True,
        header_style="dim",
        box=None,
        padding=(0, 2),
    )
    column_layout = (
        ("gpus.col.id", "left"),
        ("gpus.col.memory", "right"),
        ("gpus.col.nvlink", "right"),
        ("gpus.col.fp16", "right"),
        ("gpus.col.fp8", "center"),
        ("gpus.col.fp4", "center"),
        ("gpus.col.aliases", "left"),
    )
    for key, alignment in column_layout:
        table.add_column(t(key), justify=alignment)

    yes, no = t("hw.bool_yes"), t("hw.bool_no")

    # Preserve YAML insertion order (vendors are grouped there).
    for spec in db.gpus:
        table.add_row(
            spec.id,
            f"{spec.memory_gb} GB",
            f"{spec.nvlink_bandwidth_gbps} GB/s" if spec.nvlink_bandwidth_gbps else "—",
            f"{spec.fp16_tflops:.0f}",
            yes if spec.fp8_support else no,
            yes if spec.fp4_support else no,
            ", ".join(spec.aliases) if spec.aliases else "—",
        )
    out.print(table)
    out.print(f"[dim]{t('gpus.total', count=len(db.gpus))}[/dim]")
    _ = locale  # suppress unused var warn until we add locale-dependent notes column
640
+
641
+
642
def _verif_label(entry: EngineCompatEntry) -> Text:
    """Engine compat rows use the same label vocabulary as AnnotatedValue.

    Maps ``entry.verification_level`` ("verified" / "cited" / "unverified")
    to the matching ``Label``; any other level becomes ``Label.UNKNOWN``.
    """
    level = entry.verification_level
    if level == "verified":
        label = Label.VERIFIED
    elif level == "cited":
        label = Label.CITED
    elif level == "unverified":
        label = Label.UNVERIFIED
    else:
        label = Label.UNKNOWN
    caption = t(f"label.{label.value}")
    return Text(f"[{caption}]", style=_LABEL_STYLES.get(label, "white"))
650
+
651
+
652
+ def _fmt_flag(f: EngineFlag) -> str:
653
+ if f.value is None:
654
+ return f.flag
655
+ return f"{f.flag} {f.value}"
656
+
657
+
658
def _fmt_source(s: EngineSource) -> str:
    """Format one engine-compat source as a single ``[type] details`` line.

    "tested" sources show tester, hardware and date; other sources show their
    URL (plus the capture date when known); sources with neither show only
    the bracketed type label.
    """
    tag = f"[{t(f'source.{s.type}')}]"
    if s.type == "tested":
        return f"{tag} {s.tester} @ {s.hardware} ({s.date})"
    if not s.url:
        return tag
    suffix = f" ({t('source.captured_on')} {s.captured_date})" if s.captured_date else ""
    return f"{tag} {s.url}{suffix}"
src/llm_cal/output/labels.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """6-level label discipline — the soul of the tool.
2
+
3
+ Every number in the output must be wrapped in `AnnotatedValue` so users always know
4
+ where a value came from. Using `StrEnum` (not bare strings) means typos are caught by
5
+ mypy/ruff, not by users.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+ from enum import StrEnum
12
+ from typing import Generic, TypeVar
13
+
14
+
15
class Label(StrEnum):
    """Provenance label attached to a reported value.

    Members are strings; each member's value is the lowercase tag text.
    """

    # The six core levels.
    VERIFIED = "verified"
    INFERRED = "inferred"
    ESTIMATED = "estimated"
    CITED = "cited"
    UNVERIFIED = "unverified"
    UNKNOWN = "unknown"
    # Experimental opt-in 7th level. Populated only when --llm-review is used.
    # Never overrides the first 6 — it's an external second opinion, not truth.
    LLM_OPINION = "llm-opinion"
25
+
26
+
27
+ T = TypeVar("T")
28
+
29
+
30
@dataclass(frozen=True)
class AnnotatedValue(Generic[T]):
    """An immutable value bundled with its provenance label and optional source.

    Examples:
        AnnotatedValue(160_300_000_000, Label.VERIFIED, source="HF model_info.siblings")
        AnnotatedValue(4.52, Label.INFERRED, source="160.3 GB / 284B params")
        AnnotatedValue(2_600_000_000, Label.ESTIMATED,
                       source="compress_ratios=[0,0,4,128,...] at 128K ctx")
    """

    value: T
    label: Label
    # Free-text note on where the value came from; optional.
    source: str | None = None

    def render_tag(self) -> str:
        """Return the label's raw (unlocalized) value wrapped in square brackets."""
        tag_text = self.label.value
        return "[" + tag_text + "]"
src/llm_cal/performance/__init__.py ADDED
File without changes
src/llm_cal/performance/compute.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Performance modeling for prefill latency and decode throughput.
2
+
3
+ FORMULAS — with sources. See docs/methodology.md for the full audit.
4
+
5
+ Prefill (compute-bound):
6
+ FLOPs = 2 × params × input_tokens
7
+ latency = FLOPs / (peak_TFLOPS × num_gpus × utilization × 1e12)
8
+
9
+ Source: Kaplan et al. 2020, "Scaling Laws for Neural Language Models".
10
+ The "2" factor is the forward-pass cost per param per token, a standard
11
+ approximation in transformer inference literature.
12
+
13
+ Decode (memory-bandwidth-bound):
14
+ per_token_time = weight_bytes_per_gpu / (memory_bandwidth × utilization)
15
+ tokens_per_second = memory_bandwidth × utilization / weight_bytes_per_gpu
16
+
17
+ Source: Kwon et al. SOSP 2023 "Efficient Memory Management for Large
18
+ Language Model Serving with PagedAttention"; NVIDIA "Mastering LLM
19
+ Techniques: Inference Optimization" (2023 technical blog).
20
+
21
+ UTILIZATION FACTORS (all empirical, ALL user-overridable):
22
+ - Prefill 40% — midpoint of vLLM-reported 30-50% MFU on H100
23
+ - Decode BW 50% — midpoint of NVIDIA/vLLM-reported 40-65% achieved bandwidth
24
+ - Cluster comm 90% — typical NCCL AllReduce efficiency at TP=8 on NVLink
25
+ - Concurrency degradation 1.0 (no degradation by default)
26
+ This is the most uncertain factor. Prior versions defaulted to 1.5
27
+ (borrowed from an LLM-generated report), which was NOT from a primary
28
+ source. v0.1 defaults to 1.0 (honest baseline) and exposes the knob
29
+ so users can dial in whatever their engine actually achieves.
30
+
31
+ MoE "active" vs "total":
32
+ Strictly, MoE decode only reads the active experts per token. The
33
+ ratio used here is a rough approximation:
34
+ active_ratio ≈ (experts_per_tok + shared_experts) / (routed + shared)
35
+ This UNDERESTIMATES active weight because attention + embeddings are
36
+ always active (not just experts). For a more accurate number, use the
37
+ model card's stated "total / active" figure if available. The
38
+ "active-only" throughput is labeled "optimistic" for this reason.
39
+ """
40
+
41
+ from __future__ import annotations
42
+
43
+ from dataclasses import dataclass
44
+
45
+ from llm_cal.architecture.profile import ArchitectureProfile
46
+ from llm_cal.hardware.loader import GPUSpec
47
+ from llm_cal.output.labels import AnnotatedValue, Label
48
+
49
# Empirical default factors; each one can be overridden from the CLI.
DEFAULT_PREFILL_UTILIZATION = 0.40  # fraction of peak TFLOPS assumed in prefill
DEFAULT_DECODE_BW_UTILIZATION = 0.50  # fraction of peak memory bandwidth in decode
DEFAULT_CLUSTER_COMM_EFFICIENCY = 0.90  # multi-GPU communication efficiency

# Concurrency degradation defaults to 1.0, i.e. no degradation. An earlier
# default of 1.5 came from an LLM-generated report with no primary source, so
# it was reset to this honest baseline. Users who measure real degradation on
# their engine should raise it via the CLI.
DEFAULT_CONCURRENCY_DEGRADATION = 1.0
57
+
58
+
59
@dataclass(frozen=True)
class PrefillEstimate:
    """Result of the compute-bound prefill latency model."""

    # [estimated] 2 * params * input_tokens
    total_flops: AnnotatedValue[int]
    # Aggregate TFLOPS after applying the utilization factor.
    peak_effective_tflops: AnnotatedValue[float]
    # Single-request prefill latency in milliseconds.
    latency_ms: AnnotatedValue[float]
    # The utilization factor that was applied (kept for provenance).
    utilization: float
65
+
66
+
67
@dataclass(frozen=True)
class DecodeEstimate:
    """Result of the memory-bandwidth-bound decode throughput model."""

    # Weight bytes each GPU holds (total weight bytes split across GPUs).
    active_weight_bytes_per_gpu: AnnotatedValue[int]
    # Decode tokens/second derived from one GPU's bandwidth and shard size.
    per_gpu_tokens_per_sec: AnnotatedValue[float]
    # Cluster tokens/second, after communication efficiency is applied.
    cluster_tokens_per_sec: AnnotatedValue[float]
    # Factors that were applied (kept for provenance).
    bw_utilization: float
    cluster_comm_efficiency: float
    # Optional optimistic "active experts only" view for MoE models.
    moe_active_weight_bytes_per_gpu: AnnotatedValue[int] | None = None
    moe_active_tokens_per_sec: AnnotatedValue[float] | None = None
76
+
77
+
78
def estimate_prefill(
    profile: ArchitectureProfile,
    total_params: int,
    gpu: GPUSpec,
    num_gpus: int,
    input_tokens: int,
    utilization: float = DEFAULT_PREFILL_UTILIZATION,
) -> PrefillEstimate:
    """Estimate the prefill latency of a single request.

    Compute-bound model: FLOPs = 2 x params x tokens, divided by the effective
    aggregate throughput (per-GPU FP16 TFLOPS x GPU count x utilization).

    Returns an all-UNKNOWN estimate with zero values when the effective
    throughput, the parameter count or the token count is not positive.
    `profile` is accepted but not read by this function.
    """
    total_flops = 2 * total_params * input_tokens
    # Compute is spread over the GPUs, so their throughput adds up.
    effective_tflops = gpu.fp16_tflops * num_gpus * utilization

    if effective_tflops <= 0 or total_params <= 0 or input_tokens <= 0:
        return PrefillEstimate(
            total_flops=AnnotatedValue(0, Label.UNKNOWN, source="insufficient inputs"),
            peak_effective_tflops=AnnotatedValue(0.0, Label.UNKNOWN),
            latency_ms=AnnotatedValue(0.0, Label.UNKNOWN),
            utilization=utilization,
        )

    # Seconds first, then milliseconds.
    latency_ms = total_flops / (effective_tflops * 1e12) * 1000.0

    flops_value = AnnotatedValue(
        total_flops,
        Label.ESTIMATED,
        source=f"2 × {total_params:,} params × {input_tokens:,} tokens",
    )
    tflops_value = AnnotatedValue(
        effective_tflops,
        Label.ESTIMATED,
        source=f"{gpu.fp16_tflops} × {num_gpus} GPUs × {utilization:.0%} util",
    )
    latency_value = AnnotatedValue(
        latency_ms,
        Label.ESTIMATED,
        source=f"{total_flops:.2e} FLOPs / ({effective_tflops:.1f} effective TFLOPS × 1e12)",
    )
    return PrefillEstimate(
        total_flops=flops_value,
        peak_effective_tflops=tflops_value,
        latency_ms=latency_value,
        utilization=utilization,
    )
122
+
123
+
124
+ def _nvlink_efficiency(gpu: GPUSpec, num_gpus: int) -> float:
125
+ """Multiplier on cluster comm efficiency reflecting NVLink bandwidth.
126
+
127
+ Single-GPU has no TP all-reduce, so no penalty. H100 / B200 / H200 / A100-
128
+ SXM4 with full NVLink (>=900 GB/s aggregate, dropped to 600 for A100) get
129
+ ~1.0. Restricted-NVLink variants (H800: 400 GB/s, half of H100) pay ~8%.
130
+ PCIe-only cards (L40S, RTX) with no NVLink pay 20%.
131
+ """
132
+ if num_gpus <= 1:
133
+ return 1.0
134
+ nvlink = gpu.nvlink_bandwidth_gbps or 0
135
+ if nvlink >= 900:
136
+ return 1.0
137
+ if nvlink <= 0:
138
+ return 0.80
139
+ return 0.85 + 0.15 * (nvlink / 900.0)
140
+
141
+
142
def estimate_decode(
    profile: ArchitectureProfile,
    total_weight_bytes: int,
    gpu: GPUSpec,
    num_gpus: int,
    bw_utilization: float = DEFAULT_DECODE_BW_UTILIZATION,
    cluster_comm_efficiency: float = DEFAULT_CLUSTER_COMM_EFFICIENCY,
    moe_active_params_ratio: float | None = None,
) -> DecodeEstimate:
    """Estimate decode tokens/second.

    Decode is modeled as memory-bandwidth-bound: per-token time is the weight
    bytes a GPU must read divided by its effective bandwidth. Weights are
    split evenly across GPUs, so per-GPU weight bytes = total // num_gpus.

    If the model is MoE and `moe_active_params_ratio` is given (e.g. 0.3 for
    active/total), an optimistic "active only" throughput is reported too.

    Returns an all-UNKNOWN estimate (zero values, no MoE fields) when the GPU
    has no usable memory bandwidth, when `num_gpus` is not positive, or when
    `total_weight_bytes` is not positive. The last two previously raised
    ZeroDivisionError or produced a meaningless figure labelled [estimated].
    """

    def _unknown(reason: str) -> DecodeEstimate:
        # One place to build the "cannot estimate" result for every guard.
        return DecodeEstimate(
            active_weight_bytes_per_gpu=AnnotatedValue(0, Label.UNKNOWN, source=reason),
            per_gpu_tokens_per_sec=AnnotatedValue(0.0, Label.UNKNOWN, source=reason),
            cluster_tokens_per_sec=AnnotatedValue(0.0, Label.UNKNOWN, source=reason),
            bw_utilization=bw_utilization,
            cluster_comm_efficiency=cluster_comm_efficiency,
        )

    if gpu.memory_bandwidth_gbps is None or gpu.memory_bandwidth_gbps <= 0:
        return _unknown("GPU memory_bandwidth_gbps not in database")
    if num_gpus <= 0:
        return _unknown(f"num_gpus must be positive (got {num_gpus})")
    if total_weight_bytes <= 0:
        return _unknown("total weight bytes is zero or unknown")

    bw_bytes_per_s = gpu.memory_bandwidth_gbps * 1e9  # GB/s → bytes/s
    effective_bw = bw_bytes_per_s * bw_utilization
    # max(1, ...) keeps the divisor positive when there are fewer bytes than GPUs.
    weight_per_gpu = max(1, total_weight_bytes // num_gpus)
    per_gpu_tps = effective_bw / weight_per_gpu

    # Cluster-level: per-GPU × N × comm_efficiency × NVLink-aware penalty.
    # The NVLink penalty captures all-reduce overhead on cards with restricted
    # interconnect; a single GPU is unaffected.
    # NOTE(review): per_gpu_tps is already computed from the per-GPU shard
    # (total / N) and is multiplied by N again here — confirm this scaling is
    # intended before relying on absolute cluster numbers.
    nvlink_eff = _nvlink_efficiency(gpu, num_gpus)
    effective_comm_eff = cluster_comm_efficiency * nvlink_eff
    cluster_tps = per_gpu_tps * num_gpus * effective_comm_eff

    # MoE active-only optimistic view.
    moe_active_weight: AnnotatedValue[int] | None = None
    moe_active_tps: AnnotatedValue[float] | None = None
    if profile.is_moe and moe_active_params_ratio is not None and moe_active_params_ratio > 0:
        active_bytes = int(weight_per_gpu * moe_active_params_ratio)
        moe_active_weight = AnnotatedValue(
            active_bytes,
            Label.ESTIMATED,
            source=f"{weight_per_gpu:,} × {moe_active_params_ratio:.3f} (active/total ratio)",
        )
        # int() can truncate a tiny ratio to zero bytes; skip the throughput then.
        if active_bytes > 0:
            active_per_gpu_tps = effective_bw / active_bytes
            active_cluster_tps = active_per_gpu_tps * num_gpus * effective_comm_eff
            moe_active_tps = AnnotatedValue(
                active_cluster_tps,
                Label.ESTIMATED,
                source=(
                    f"optimistic MoE active-only: effective_bw / {active_bytes:,} × "
                    f"{num_gpus} × {effective_comm_eff:.3f}"
                ),
            )

    return DecodeEstimate(
        active_weight_bytes_per_gpu=AnnotatedValue(
            weight_per_gpu,
            Label.ESTIMATED,
            source=f"{total_weight_bytes:,} bytes / {num_gpus} TP ranks",
        ),
        per_gpu_tokens_per_sec=AnnotatedValue(
            per_gpu_tps,
            Label.ESTIMATED,
            source=(
                f"{gpu.memory_bandwidth_gbps} GB/s × {bw_utilization:.0%} util / "
                f"{weight_per_gpu:,} weight bytes"
            ),
        ),
        cluster_tokens_per_sec=AnnotatedValue(
            cluster_tps,
            Label.ESTIMATED,
            source=(
                f"per-GPU × {num_gpus} GPUs × {cluster_comm_efficiency:.0%} comm × "
                f"{nvlink_eff:.3f} NVLink penalty (NVLink={gpu.nvlink_bandwidth_gbps or 0} GB/s)"
            ),
        ),
        bw_utilization=bw_utilization,
        cluster_comm_efficiency=cluster_comm_efficiency,
        moe_active_weight_bytes_per_gpu=moe_active_weight,
        moe_active_tokens_per_sec=moe_active_tps,
    )
src/llm_cal/performance/concurrency.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Dual-bound concurrency analysis + bottleneck classification.
2
+
3
+ Models two concurrency ceilings:
4
+ K = memory-capacity bound
5
+ (usable GPU memory ÷ per-request KV cache)
6
+ L = compute/bandwidth bound at a given SLA
7
+ (cluster decode throughput ÷ target per-user tokens/sec ÷ degradation)
8
+
9
+ Max concurrent = min(K, L). Whichever is smaller names the bottleneck.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import math
15
+ from dataclasses import dataclass
16
+ from typing import Literal
17
+
18
+ from llm_cal.output.labels import AnnotatedValue, Label
19
+ from llm_cal.performance.compute import (
20
+ DEFAULT_CONCURRENCY_DEGRADATION,
21
+ DecodeEstimate,
22
+ )
23
+
24
# Closed set of verdicts the concurrency analysis can report.
Bottleneck = Literal["memory_capacity", "memory_bandwidth", "compute", "insufficient_data"]
30
+
31
+
32
@dataclass(frozen=True)
class ConcurrencyAnalysis:
    """Outcome of the dual-bound (K vs L) concurrency analysis."""

    # --- K: memory-capacity bound -------------------------------------
    k_bound: AnnotatedValue[int]
    # Inputs the K bound was computed from.
    k_source_headroom_bytes: int
    k_source_kv_per_req_bytes: int

    # --- L: throughput bound at the requested per-user rate -----------
    l_bound: AnnotatedValue[int]
    # Inputs the L bound was computed from.
    target_tokens_per_sec: float
    degradation_factor: float

    # --- Verdict -------------------------------------------------------
    max_concurrent: AnnotatedValue[int]
    bottleneck: Bottleneck
    # Human-readable explanation of the verdict, in English and Chinese.
    bottleneck_reason_en: str
    bottleneck_reason_zh: str
47
+
48
+
49
def analyze(
    *,
    cluster_headroom_bytes: int,  # total KV headroom across all GPUs at ref context
    kv_bytes_per_request: int,  # single-request KV cache at ref context
    decode: DecodeEstimate,
    target_tokens_per_sec: float,
    degradation: float = DEFAULT_CONCURRENCY_DEGRADATION,
) -> ConcurrencyAnalysis:
    """Compute the K and L bounds and pick the tighter one.

    K is how many requests fit in the KV headroom; L is how many users the
    cluster decode throughput can serve at `target_tokens_per_sec`, divided
    by `degradation`.

    `cluster_headroom_bytes` and `kv_bytes_per_request` should be pre-adjusted
    for TP sharding (see fleet planner for the same rule).

    A bound that could not be computed is labelled UNKNOWN and is never used
    as if it were a real zero: if either bound is UNKNOWN the verdict is
    "insufficient_data". When both bounds are known, the smaller one decides
    the bottleneck and the result is labelled ESTIMATED, even when it is 0.
    """
    # K: how many requests fit in KV memory.
    if kv_bytes_per_request <= 0:
        k = 0
        k_label = Label.UNKNOWN
        k_source = "KV cache per request is zero or unknown"
    else:
        # Negative headroom means nothing fits; clamp to zero.
        k = max(0, math.floor(cluster_headroom_bytes / kv_bytes_per_request))
        k_label = Label.ESTIMATED
        k_source = (
            f"{cluster_headroom_bytes:,} bytes headroom / "
            f"{kv_bytes_per_request:,} bytes per request"
        )

    # L: how many concurrent users can maintain the target tokens/sec.
    cluster_tps = decode.cluster_tokens_per_sec.value
    if cluster_tps <= 0 or target_tokens_per_sec <= 0 or degradation <= 0:
        l_bound = 0
        l_label = Label.UNKNOWN
        l_source = "cluster throughput or target is zero / unknown"
    else:
        l_bound = max(0, math.floor(cluster_tps / target_tokens_per_sec / degradation))
        l_label = Label.ESTIMATED
        l_source = (
            f"{cluster_tps:.1f} tok/s cluster / "
            f"{target_tokens_per_sec:.1f} target / {degradation:.2f} degradation"
        )

    k_known = k_label is not Label.UNKNOWN
    l_known = l_label is not Label.UNKNOWN

    bottleneck: Bottleneck
    if not (k_known and l_known):
        # An unknown bound is stored as 0 above; comparing it numerically
        # would wrongly name it the bottleneck, so stop here instead.
        max_n = 0
        max_label = Label.UNKNOWN
        max_source = "K or L unknown"
        bottleneck = "insufficient_data"
        if not k_known and not l_known:
            reason_en = "Both K and L unknown — cannot conclude."
            reason_zh = "K 和 L 均未知,无法得出结论。"
        elif not k_known:
            reason_en = f"K unknown (L = {l_bound}) — cannot conclude."
            reason_zh = f"K 未知(L = {l_bound}),无法得出结论。"
        else:
            reason_en = f"L unknown (K = {k}) — cannot conclude."
            reason_zh = f"L 未知(K = {k}),无法得出结论。"
    elif k <= l_bound:
        max_n = k
        max_label = Label.ESTIMATED
        max_source = f"min(K={k}, L={l_bound})"
        bottleneck = "memory_capacity"
        reason_en = (
            f"K ({k}) ≤ L ({l_bound}) → memory-capacity bound. "
            "KV cache exhausts GPU headroom before throughput SLA does."
        )
        reason_zh = (
            f"K ({k}) ≤ L ({l_bound}) → 显存容量瓶颈。先达到 KV cache 容量上限,才到吞吐目标。"
        )
    else:
        max_n = l_bound
        max_label = Label.ESTIMATED
        max_source = f"min(K={k}, L={l_bound})"
        # Whether it's "compute" or "bandwidth" depends on where decode is bound.
        # For v0.1 we just say "memory bandwidth / compute" since decode is
        # bw-bound by default and the two share the same formula output.
        bottleneck = "memory_bandwidth"
        reason_en = (
            f"L ({l_bound}) < K ({k}) → memory-bandwidth / compute bound. "
            "Cluster can't sustain target tok/s per user at this concurrency."
        )
        reason_zh = f"L ({l_bound}) < K ({k}) → 带宽/算力瓶颈。集群在此并发下无法维持目标 tok/s。"

    return ConcurrencyAnalysis(
        k_bound=AnnotatedValue(k, k_label, source=k_source),
        k_source_headroom_bytes=cluster_headroom_bytes,
        k_source_kv_per_req_bytes=kv_bytes_per_request,
        l_bound=AnnotatedValue(l_bound, l_label, source=l_source),
        target_tokens_per_sec=target_tokens_per_sec,
        degradation_factor=degradation,
        max_concurrent=AnnotatedValue(max_n, max_label, source=max_source),
        bottleneck=bottleneck,
        bottleneck_reason_en=reason_en,
        bottleneck_reason_zh=reason_zh,
    )
src/llm_cal/weight_analyzer/__init__.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Weight analyzer — observed bytes + inferred quantization scheme.
2
+
3
+ Rules:
4
+ - `[verified]` — directly from HF/ModelScope API (sum of siblings[].size). Nothing else.
5
+ - `[inferred]` — any derivation, including bits/param and quantization guess.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+ from typing import TYPE_CHECKING, Literal
12
+
13
+ from llm_cal.model_source.base import SiblingFile
14
+ from llm_cal.output.labels import AnnotatedValue, Label
15
+
16
+ if TYPE_CHECKING:
17
+ from llm_cal.weight_analyzer.fingerprint import QuantFingerprint
18
+
19
+ # Known byte-per-param values. bits/param = bpp * 8.
20
+ QuantizationScheme = Literal[
21
+ "FP16",
22
+ "BF16",
23
+ "FP8",
24
+ "INT8",
25
+ "FP4_FP8_MIXED", # DeepSeek-V4-Flash style
26
+ "INT4",
27
+ "GPTQ_INT4",
28
+ "AWQ_INT4",
29
+ "UNKNOWN",
30
+ ]
31
+
32
+ # Rough bytes-per-param anchor points. Used by reconciler.
33
+ _QUANT_BPP: dict[QuantizationScheme, float] = {
34
+ "FP16": 2.00,
35
+ "BF16": 2.00,
36
+ "FP8": 1.00,
37
+ "INT8": 1.00,
38
+ "FP4_FP8_MIXED": 0.55, # DeepSeek V4 empirical (~4.5 bits/param)
39
+ "INT4": 0.50,
40
+ "GPTQ_INT4": 0.55, # +scale tensors overhead
41
+ "AWQ_INT4": 0.55,
42
+ "UNKNOWN": 0.0,
43
+ }
44
+
45
+
46
@dataclass(frozen=True)
class WeightReport:
    """What the weight analyzer could determine from the files and param count."""

    # [verified] observed size of the weight files, in bytes.
    total_bytes: AnnotatedValue[int]
    # [inferred] observed bytes / params * 8; None when it cannot be derived.
    bits_per_param: AnnotatedValue[float] | None
    # Quantization scheme believed to be in use.
    quantization_guess: AnnotatedValue[QuantizationScheme]
53
+
54
+
55
+ def _safetensors_total_bytes(siblings: tuple[SiblingFile, ...]) -> int:
56
+ """Sum all *.safetensors file sizes. Ignores config, tokenizer, etc."""
57
+ return sum((s.size or 0) for s in siblings if s.filename.endswith(".safetensors"))
58
+
59
+
60
def analyze(
    siblings: tuple[SiblingFile, ...],
    total_params: int | None,
    fingerprint: QuantFingerprint | None = None,
) -> WeightReport:
    """Compute a weight report from sibling files and the parameter count.

    `total_params` comes from summing across the architecture (computed
    elsewhere), or is None if it could not be determined. Without a positive
    `total_params`, or without any safetensors bytes, bits/param cannot be
    derived and `bits_per_param` is None.

    `fingerprint` (optional) is authoritative evidence from config.json or a
    safetensors header. When present it always decides `quantization_guess`
    (VERIFIED), including when bits/param cannot be derived: the evidence does
    not depend on the parameter count, so it must not be discarded. Without a
    fingerprint the guess falls back to the bytes-per-param nearest-match
    heuristic (INFERRED), or UNKNOWN when bits/param is unavailable.
    """
    observed_bytes = _safetensors_total_bytes(siblings)
    total_bytes = AnnotatedValue(
        observed_bytes,
        Label.VERIFIED,
        source="sum of safetensors siblings from model_info API",
    )

    verified_quant: AnnotatedValue[QuantizationScheme] | None = None
    if fingerprint is not None:
        verified_quant = AnnotatedValue(
            fingerprint.scheme,
            Label.VERIFIED,
            source=fingerprint.evidence,
        )

    # A non-positive parameter count is treated like an unknown one; dividing
    # by it would give a meaningless (zero-division or negative) bits/param.
    if not total_params or total_params < 0 or observed_bytes == 0:
        unknown_quant: AnnotatedValue[QuantizationScheme] = AnnotatedValue(
            "UNKNOWN",
            Label.UNKNOWN,
            source="total_params unknown or no safetensors files",
        )
        return WeightReport(
            total_bytes=total_bytes,
            bits_per_param=None,
            quantization_guess=verified_quant if verified_quant is not None else unknown_quant,
        )

    bpp = observed_bytes / total_params
    bits_per_param = AnnotatedValue(
        bpp * 8,
        Label.INFERRED,
        source=f"{observed_bytes} bytes / {total_params} params",
    )

    quant = verified_quant if verified_quant is not None else _guess_quantization(bpp)

    return WeightReport(
        total_bytes=total_bytes,
        bits_per_param=bits_per_param,
        quantization_guess=quant,
    )
114
+
115
+
116
def _guess_quantization(bpp: float) -> AnnotatedValue[QuantizationScheme]:
    """Pick the scheme whose bytes-per-param anchor is nearest to `bpp`.

    A scheme is only eligible when `bpp` lies within its tolerance, expressed
    in bytes/param as coded here: 0.10 for FP4_FP8_MIXED and GPTQ_INT4, 0.05
    for FP16, FP8 and INT4. Among eligible schemes the smallest distance
    wins; on an exact tie the scheme listed first wins. Returns UNKNOWN when
    nothing is eligible, otherwise an INFERRED value.
    """
    # (scheme, anchor bytes/param, tolerance). Order matters only for ties.
    anchors: list[tuple[QuantizationScheme, float, float]] = [
        ("FP16", _QUANT_BPP["FP16"], 0.05),
        ("FP8", _QUANT_BPP["FP8"], 0.05),
        ("FP4_FP8_MIXED", _QUANT_BPP["FP4_FP8_MIXED"], 0.10),
        ("INT4", _QUANT_BPP["INT4"], 0.05),
        ("GPTQ_INT4", _QUANT_BPP["GPTQ_INT4"], 0.10),
    ]
    eligible: list[tuple[float, QuantizationScheme]] = []
    for name, anchor, tolerance in anchors:
        distance = abs(bpp - anchor)
        if distance <= tolerance:
            eligible.append((distance, name))

    if not eligible:
        return AnnotatedValue(
            "UNKNOWN",
            Label.UNKNOWN,
            source=f"bits/param {bpp * 8:.2f} does not match known schemes",
        )

    # min() keeps the first of equally-near candidates, i.e. list order.
    winner = min(eligible, key=lambda pair: pair[0])[1]
    return AnnotatedValue(
        winner,
        Label.INFERRED,
        source=f"bits/param {bpp * 8:.2f} within tolerance of {winner}",
    )
src/llm_cal/weight_analyzer/fingerprint.py ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Quantization fingerprinting — tie-breakers for the reconciler.
2
+
3
+ When `reconciler.reconcile` has multiple schemes tied at the same bits/param
4
+ (FP4_FP8_MIXED, GPTQ_INT4, and AWQ_INT4 all sit at bpp=0.55), bytes alone
5
+ cannot pick a winner. We resolve the ambiguity with two stronger signals:
6
+
7
+ 1. `quantization_config` in config.json — explicit declaration by the model
8
+ author. Covers most GPTQ/AWQ/FP8 community uploads.
9
+
10
+ 2. safetensors per-tensor dtype + tensor-name patterns — the ground truth.
11
+ Covers models like DeepSeek-V4-Flash that use custom mixed-precision
12
+ packs without a config.json declaration.
13
+
14
+ Both return a `QuantFingerprint`. The reconciler uses the fingerprint's
15
+ `scheme` as a tie-breaker, and the `evidence` string flows into the
16
+ derivation trace.
17
+
18
+ This module is pure — no network, no file I/O. `safetensors_reader.py`
19
+ handles fetching; this module interprets what was fetched.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ from dataclasses import dataclass
25
+ from typing import Any, Literal
26
+
27
+ from llm_cal.weight_analyzer import QuantizationScheme
28
+
29
# Where a fingerprint's evidence was read from.
SourceType = Literal["config_json", "safetensors_header"]


@dataclass(frozen=True)
class QuantFingerprint:
    """A quantization scheme together with the evidence that identified it."""

    # The scheme the evidence points to.
    scheme: QuantizationScheme
    # Which kind of input produced the evidence.
    source_type: SourceType
    # Human-readable justification, shown in the derivation trace.
    evidence: str
37
+
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # Config.json: explicit quant_method declaration
41
+
42
+
43
def from_config(config: dict[str, Any]) -> QuantFingerprint | None:
    """Map the `quantization_config` block of a config.json to a scheme.

    Recognizes gptq (4/8 bit), awq (4 bit), fp8, compressed-tensors (first
    group only), bitsandbytes (`load_in_4bit` / `load_in_8bit`) and a
    standalone FP8 `weight_dtype`.

    Returns None when there is no `quantization_config` dict, when the
    declaration is not one of the recognized combinations, or when the block
    is malformed (for example `config_groups` or a group's `weights` is not a
    dict). Malformed blocks previously raised AttributeError.
    """
    qc = config.get("quantization_config")
    if not isinstance(qc, dict):
        return None

    def _declared(scheme: QuantizationScheme, evidence: str) -> QuantFingerprint:
        # Every fingerprint produced here comes from config.json.
        return QuantFingerprint(scheme=scheme, source_type="config_json", evidence=evidence)

    quant_method = qc.get("quant_method")
    bits = qc.get("bits")
    weight_dtype = qc.get("weight_dtype")

    # GPTQ family
    if quant_method == "gptq":
        if bits == 4:
            return _declared(
                "GPTQ_INT4", "config.json quantization_config.quant_method=gptq, bits=4"
            )
        if bits == 8:
            return _declared("INT8", "config.json quantization_config.quant_method=gptq, bits=8")

    # AWQ family
    if quant_method == "awq" and bits == 4:
        return _declared("AWQ_INT4", "config.json quantization_config.quant_method=awq, bits=4")

    # FP8 (native or compressed-tensors wrapping)
    if quant_method == "fp8":
        return _declared("FP8", "config.json quantization_config.quant_method=fp8")

    # compressed-tensors — inspect the inner weight spec of the first group.
    if quant_method == "compressed-tensors":
        groups = qc.get("config_groups")
        if isinstance(groups, dict):
            for group in groups.values():
                if not isinstance(group, dict):
                    continue
                weights = group.get("weights")
                if isinstance(weights, dict):
                    num_bits = weights.get("num_bits")
                    wtype = weights.get("type")
                    if num_bits == 8 and wtype in ("float", "fp8"):
                        return _declared(
                            "FP8", "config.json compressed-tensors group weights=fp8/8bit"
                        )
                    if num_bits == 8 and wtype == "int":
                        return _declared(
                            "INT8", "config.json compressed-tensors group weights=int/8bit"
                        )
                    if num_bits == 4 and wtype == "int":
                        # Generic INT4 — don't claim GPTQ or AWQ without more evidence.
                        return _declared(
                            "INT4", "config.json compressed-tensors group weights=int/4bit"
                        )
                # Only the first dict-shaped group is consulted; anything it
                # does not settle falls through to the checks below.
                break

    # bitsandbytes — load_in_4bit / load_in_8bit flags
    if quant_method == "bitsandbytes":
        if qc.get("load_in_4bit"):
            return _declared("INT4", "config.json quant_method=bitsandbytes, load_in_4bit=true")
        if qc.get("load_in_8bit"):
            return _declared("INT8", "config.json quant_method=bitsandbytes, load_in_8bit=true")

    # Standalone weight_dtype (no nested groups — some custom loaders)
    if weight_dtype in ("float8_e4m3fn", "float8_e5m2"):
        return _declared(
            "FP8", f"config.json quantization_config.weight_dtype={weight_dtype}"
        )

    return None
146
+
147
+
148
+ # ---------------------------------------------------------------------------
149
+ # Safetensors header: per-tensor dtype + tensor-name patterns
150
+
151
# safetensors dtype strings (from the format spec)
_FP8_DTYPES = frozenset({"F8_E4M3", "F8_E5M2"})
_FP4_DTYPES = frozenset({"F4_E2M1", "F4"})  # F4 is used by some toolchains
_FP16_DTYPES = frozenset({"F16"})
_BF16_DTYPES = frozenset({"BF16"})
_INT8_DTYPES = frozenset({"I8", "U8"})
# F8_E8M0 is the 8-bit shared-exponent scaling factor used by MX-format
# block-scaled quantization (MXFP4, MXFP8). Its presence alongside packed
# integer weights (I8) is the signature of FP4 weight packing.
_MX_SCALE_DTYPES = frozenset({"F8_E8M0"})


def from_safetensors_dtypes(tensor_dtypes: dict[str, str]) -> QuantFingerprint | None:
    """Fingerprint from a parsed safetensors header (tensor_name -> dtype string).

    Packed-int4 name markers (.qweight / .g_idx / .qzeros) are checked first.
    Otherwise the decision is made from the dtypes of "weight-like" tensors:
    norms, biases, embeddings and lm_head are skipped because they often stay
    in FP16/BF16 even in heavily quantized models. If the name heuristic
    selects nothing, all tensors are considered.

    Returns None for an empty header or for a dtype mix that has no named
    scheme.
    """
    if not tensor_dtypes:
        return None

    def _found(scheme: QuantizationScheme, evidence: str) -> QuantFingerprint:
        # Every fingerprint produced here comes from a safetensors header.
        return QuantFingerprint(
            scheme=scheme, source_type="safetensors_header", evidence=evidence
        )

    names = set(tensor_dtypes.keys())

    # ------------------------------------------------------------------
    # Packed-int4 schemes first — they have distinctive tensor-name markers
    # even though the underlying dtype is I32 (bit-packed).

    has_qweight = any(n.endswith((".qweight", "_qweight")) for n in names)
    has_g_idx = any(n.endswith((".g_idx", "_g_idx")) for n in names)
    has_qzeros = any(n.endswith((".qzeros", "_qzeros")) for n in names)

    if has_qweight and has_g_idx:
        return _found(
            "GPTQ_INT4", "safetensors header has .qweight + .g_idx tensors (GPTQ marker)"
        )
    # has_g_idx is necessarily False here: the GPTQ case returned above.
    if has_qweight and has_qzeros:
        return _found(
            "AWQ_INT4", "safetensors header has .qweight + .qzeros, no .g_idx (AWQ marker)"
        )

    # ------------------------------------------------------------------
    # Dtype histogram over weight-like tensors.

    def _is_weight_tensor(name: str) -> bool:
        lname = name.lower()
        # "norm" (not ".norm") so that names such as input_layernorm.weight,
        # post_attention_layernorm.weight and q_norm.weight are skipped too;
        # otherwise their BF16/FP16 dtype defeats the "pure" checks below.
        if any(sub in lname for sub in ("norm", ".bias", "embed", "lm_head")):
            return False
        # Tensor names in transformer models usually contain "weight"
        return "weight" in lname or lname.endswith((".w", ".proj"))

    weight_dtypes: list[str] = [dt for n, dt in tensor_dtypes.items() if _is_weight_tensor(n)]
    if not weight_dtypes:
        # Fall back to all dtypes if the name heuristic found nothing
        weight_dtypes = list(tensor_dtypes.values())

    def _count(dtype_set: frozenset[str]) -> int:
        # Number of weight-like tensors whose dtype is in dtype_set.
        return sum(dt in dtype_set for dt in weight_dtypes)

    has_fp4 = _count(_FP4_DTYPES) > 0
    has_fp8 = _count(_FP8_DTYPES) > 0
    has_fp16 = _count(_FP16_DTYPES) > 0
    has_bf16 = _count(_BF16_DTYPES) > 0
    has_int8 = _count(_INT8_DTYPES) > 0
    # Scale tensors are not weight-like, so look at every tensor for them.
    has_mx_scale = any(dt in _MX_SCALE_DTYPES for dt in tensor_dtypes.values())

    # MX-format block-scaled quantization (DeepSeek-V4-Flash pattern):
    # F8_E8M0 scale tensors + packed I8 weights, plus a layer of F8_E4M3 for
    # the FP8 sub-pack. Detected via the scale-dtype signature.
    if has_mx_scale and has_int8:
        if has_fp8:
            return _found(
                "FP4_FP8_MIXED",
                (
                    f"safetensors header: F8_E8M0 scale tensors + "
                    f"{_count(_INT8_DTYPES)} packed-I8 "
                    f"(FP4) weights + "
                    f"{_count(_FP8_DTYPES)} FP8 weights — "
                    f"MX block-scaled mixed pack"
                ),
            )
        # MXFP4 only — nominally INT4 but with the MX scaling envelope.
        # FP4_FP8_MIXED is the closest existing scheme (bpp ≈ 0.55 anchor).
        return _found(
            "FP4_FP8_MIXED",
            (
                f"safetensors header: F8_E8M0 scale tensors + "
                f"{_count(_INT8_DTYPES)} packed-I8 "
                f"(FP4) weights — MXFP4 block-scaled"
            ),
        )

    # Classic FP4 + FP8 mixed (older toolchains exposing F4 dtype directly)
    if has_fp4 and has_fp8:
        return _found(
            "FP4_FP8_MIXED",
            (
                f"safetensors header has both FP4 and FP8 weight tensors "
                f"({_count(_FP4_DTYPES)} FP4, "
                f"{_count(_FP8_DTYPES)} FP8)"
            ),
        )

    # FP8 with no FP4 / INT8 weights; the evidence reports how many of the
    # weight-like tensors are actually FP8.
    if has_fp8 and not (has_fp4 or has_int8):
        fp8_count = _count(_FP8_DTYPES)
        return _found(
            "FP8",
            f"safetensors header: {fp8_count}/{len(weight_dtypes)} weight tensors are FP8",
        )

    # Pure FP16
    if has_fp16 and not (has_fp8 or has_fp4 or has_int8 or has_bf16):
        return _found(
            "FP16", f"safetensors header: all {len(weight_dtypes)} weight tensors are F16"
        )

    # Pure BF16
    if has_bf16 and not (has_fp8 or has_fp4 or has_int8 or has_fp16):
        return _found(
            "BF16", f"safetensors header: all {len(weight_dtypes)} weight tensors are BF16"
        )

    # Pure INT8
    if has_int8 and not (has_fp8 or has_fp4 or has_fp16 or has_bf16):
        return _found(
            "INT8", f"safetensors header: {len(weight_dtypes)} weight tensors are INT8"
        )

    # Mixed in a way we don't have a named scheme for — stay silent
    return None
src/llm_cal/weight_analyzer/reconciler.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Reconciler — compare observed weight bytes vs computed under each quantization assumption.
2
+
3
+ This is the module that outputs the DeepSeek-V4-Flash story (Problem Evidence in design doc):
4
+ "gpu_poor says 285 GB (assumes pure FP8); we say 160 GB (observed bytes match FP4+FP8
5
+ pack hypothesis). Here's why."
6
+
7
+ Core value: makes the quantization inference step transparent. The user sees all
8
+ candidates considered, not just the winner.
9
+
10
+ When multiple schemes share the same bytes-per-param anchor (FP4_FP8_MIXED,
11
+ GPTQ_INT4, and AWQ_INT4 all sit at bpp=0.55), bytes alone cannot pick a winner.
12
+ Pass a `QuantFingerprint` from `fingerprint.from_config()` or
13
+ `fingerprint.from_safetensors_dtypes()` to break the tie with authoritative
14
+ evidence.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ from dataclasses import dataclass
20
+
21
+ from llm_cal.output.labels import AnnotatedValue, Label
22
+ from llm_cal.weight_analyzer import _QUANT_BPP, QuantizationScheme
23
+ from llm_cal.weight_analyzer.fingerprint import QuantFingerprint
24
+
25
+
26
@dataclass(frozen=True)
class ReconciliationCandidate:
    """One quantization hypothesis scored against the observed weight bytes.

    Attributes:
        scheme: Quantization scheme this candidate assumes.
        predicted_bytes: Weight size predicted under that scheme.
        delta_bytes: ``observed - predicted``; positive means the observed
            bytes are larger than the prediction.
        relative_error: ``|delta_bytes| / predicted_bytes``.
    """

    scheme: QuantizationScheme
    predicted_bytes: int
    delta_bytes: int
    relative_error: float
32
+
33
+
34
@dataclass(frozen=True)
class ReconciliationReport:
    """Result of scoring the observed weight bytes against the known schemes.

    Attributes:
        observed_bytes: Observed weight size in bytes.
        total_params: Parameter count the predictions were based on.
        candidates: Every scored hypothesis, sorted by ascending relative
            error (empty when nothing could be tested).
        best: The chosen scheme wrapped in an ``AnnotatedValue``.
    """

    observed_bytes: int
    total_params: int
    candidates: tuple[ReconciliationCandidate, ...]
    best: AnnotatedValue[QuantizationScheme]

    def summary_line(self) -> str:
        """Return a one-line summary for the output formatter.

        Note: the line describes the closest bytes match (``candidates[0]``),
        which is not necessarily the scheme stored in ``best``.
        """
        if self.candidates:
            top = self.candidates[0]
            error_pct = top.relative_error * 100
            return (
                f"Observed {self.observed_bytes:,} bytes. "
                f"Best match: {top.scheme} "
                f"(predicts {top.predicted_bytes:,} bytes, "
                f"{error_pct:.1f}% error)"
            )
        return f"{self.observed_bytes:,} bytes — no quantization candidates tested"
52
+
53
+
54
# Tie tolerance: a candidate whose relative error differs from the winner's by
# less than this amount is reported as tied with the winner.
_TIE_THRESHOLD = 0.01

# Confidence gate: when even the closest candidate's relative error exceeds
# this fraction, the bytes-only verdict is UNKNOWN.
_UNKNOWN_THRESHOLD = 0.15
60
+
61
+
62
def reconcile(
    observed_bytes: int,
    total_params: int,
    fingerprint: QuantFingerprint | None = None,
) -> ReconciliationReport:
    """Compare observed file bytes against every known quantization scheme.

    Args:
        observed_bytes: Sum of safetensors file sizes.
        total_params: Estimated param count.
        fingerprint: Optional authoritative evidence from config.json or
            safetensors header. Breaks bpp ties and annotates the source.

    Returns:
        A report carrying the full candidate ranking (so the formatter can show
        "gpu_poor would say X; we say Y") plus the chosen scheme. The verdict
        is UNKNOWN when the inputs are not positive, when no scheme has a
        bytes-per-param anchor, or when (without a fingerprint) the closest
        candidate is off by more than ``_UNKNOWN_THRESHOLD``.
    """
    # Non-positive inputs cannot be scored: zero makes every prediction
    # meaningless, and a negative value would yield negative relative errors
    # that sort ahead of every genuine candidate.
    if observed_bytes <= 0 or total_params <= 0:
        if observed_bytes < 0 or total_params < 0:
            reason = "observed_bytes or total_params is negative"
        else:
            reason = "observed_bytes or total_params is zero"
        return ReconciliationReport(
            observed_bytes=observed_bytes,
            total_params=total_params,
            candidates=(),
            best=AnnotatedValue("UNKNOWN", Label.UNKNOWN, source=reason),
        )

    candidates: list[ReconciliationCandidate] = []
    for scheme, bpp in _QUANT_BPP.items():
        # Schemes without a bytes-per-param anchor cannot predict a size.
        if scheme == "UNKNOWN" or bpp == 0.0:
            continue
        predicted = int(bpp * total_params)
        delta = observed_bytes - predicted
        # A tiny model can round the prediction down to 0 bytes; rank it last.
        rel_err = abs(delta) / predicted if predicted else float("inf")
        candidates.append(
            ReconciliationCandidate(
                scheme=scheme,
                predicted_bytes=predicted,
                delta_bytes=delta,
                relative_error=rel_err,
            )
        )

    # Without this guard an empty bpp table would crash on candidates[0].
    if not candidates:
        return ReconciliationReport(
            observed_bytes=observed_bytes,
            total_params=total_params,
            candidates=(),
            best=AnnotatedValue(
                "UNKNOWN",
                Label.UNKNOWN,
                source="no quantization scheme with a bytes-per-param anchor to test",
            ),
        )

    candidates.sort(key=lambda c: c.relative_error)
    ranked = tuple(candidates)
    argmin_scheme = ranked[0].scheme
    argmin_err = ranked[0].relative_error

    # Fingerprint path: authoritative declaration from config.json or the
    # safetensors header takes over the decision (and resolves bpp ties).
    if fingerprint is not None:
        return _reconcile_with_fingerprint(
            observed_bytes=observed_bytes,
            total_params=total_params,
            candidates=ranked,
            fingerprint=fingerprint,
            argmin_scheme=argmin_scheme,
            argmin_err=argmin_err,
        )

    # Tolerance gate without fingerprint: refuse to name a scheme when even
    # the closest one is far from the observed size.
    if argmin_err > _UNKNOWN_THRESHOLD:
        return ReconciliationReport(
            observed_bytes=observed_bytes,
            total_params=total_params,
            candidates=ranked,
            best=AnnotatedValue(
                "UNKNOWN",
                Label.UNKNOWN,
                source=(
                    f"closest candidate ({argmin_scheme}) is off by "
                    f"{argmin_err * 100:.1f}% — no confident match"
                ),
            ),
        )

    # Bytes-only tie detection: every candidate (winner included) whose error
    # is within _TIE_THRESHOLD of the winner's and still inside the gate.
    tied_schemes = [
        c.scheme
        for c in ranked
        if abs(c.relative_error - argmin_err) < _TIE_THRESHOLD
        and c.relative_error <= _UNKNOWN_THRESHOLD
    ]
    source_text = (
        f"best match among {len(ranked)} candidates, {argmin_err * 100:.1f}% error"
    )
    if len(tied_schemes) > 1:
        rivals = ", ".join(s for s in tied_schemes if s != argmin_scheme)
        source_text += (
            f" — tied with {rivals} "
            f"at the same bits/param; distinguishing requires config.json "
            f"quantization_config or safetensors per-tensor dtype "
            f"(neither available for this model)"
        )

    return ReconciliationReport(
        observed_bytes=observed_bytes,
        total_params=total_params,
        candidates=ranked,
        best=AnnotatedValue(argmin_scheme, Label.INFERRED, source=source_text),
    )
166
+
167
+
168
def _reconcile_with_fingerprint(
    observed_bytes: int,
    total_params: int,
    candidates: tuple[ReconciliationCandidate, ...],
    fingerprint: QuantFingerprint,
    argmin_scheme: QuantizationScheme,
    argmin_err: float,
) -> ReconciliationReport:
    """Pick the verdict when a fingerprint (declared scheme) is available.

    Three outcomes, decided by where the declared scheme sits in the ranking:
      - Not among the candidates: we have no bpp anchor for it, so fall back
        to the bytes-only argmin and label it INFERRED.
      - Among the candidates with a relative error inside
        ``_UNKNOWN_THRESHOLD``: adopt it, label VERIFIED, and mention the
        bytes-only argmin if that was a different, closer scheme.
      - Among the candidates but outside the threshold: still adopt it as
        VERIFIED and record the discrepancy in the source text (the param
        estimate is the more likely culprit than the declaration).
    """
    declared = fingerprint.scheme

    # Locate the scored candidate for the declared scheme, if there is one.
    declared_hit = None
    for cand in candidates:
        if cand.scheme == declared:
            declared_hit = cand
            break

    if declared_hit is None:
        # Unknown scheme from fingerprint — degrade gracefully to bytes-only.
        chosen = argmin_scheme
        label = Label.INFERRED
        source = (
            f"fingerprint declared {declared} ({fingerprint.evidence}) "
            f"but we have no bpp anchor for it; fell back to bytes match "
            f"{argmin_scheme} at {argmin_err * 100:.1f}% error"
        )
    elif declared_hit.relative_error <= _UNKNOWN_THRESHOLD:
        # Agreement — the declaration is plausible given the observed bytes.
        chosen = declared
        label = Label.VERIFIED
        bytes_prefer_other = (
            declared != argmin_scheme and argmin_err < declared_hit.relative_error
        )
        if bytes_prefer_other:
            note = (
                f" (bytes alone would argmin to {argmin_scheme} at "
                f"{argmin_err * 100:.1f}%; we trust the declaration)"
            )
        else:
            note = ""
        source = (
            f"{fingerprint.evidence} "
            f"(predicts {declared_hit.predicted_bytes:,} bytes, "
            f"{declared_hit.relative_error * 100:.1f}% error){note}"
        )
    else:
        # Disagreement — the declared scheme's prediction misses the observed
        # bytes by more than the threshold; the declaration still wins.
        chosen = declared
        label = Label.VERIFIED
        source = (
            f"{fingerprint.evidence} "
            f"(NOTE: bytes predict {declared_hit.predicted_bytes:,}, off by "
            f"{declared_hit.relative_error * 100:.1f}% — likely our param estimate is off, "
            f"not the declaration)"
        )

    return ReconciliationReport(
        observed_bytes=observed_bytes,
        total_params=total_params,
        candidates=candidates,
        best=AnnotatedValue(chosen, label, source=source),
    )
src/llm_cal/weight_analyzer/safetensors_reader.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Fetch the safetensors header of one shard to recover per-tensor dtypes.
2
+
3
+ The safetensors binary format:
4
+ bytes[0..8] uint64 little-endian header length N (JSON bytes)
5
+ bytes[8..8+N] UTF-8 JSON tensor_name -> {dtype, shape, data_offsets}
6
+ bytes[8+N..] raw tensor data (we never read this)
7
+
8
+ So we can identify every tensor's dtype without downloading any weight bytes.
9
+ Headers are usually 50 KB - 2 MB. We cap the Range request at 16 MB as a
10
+ safety net; anything larger is treated as malformed.
11
+
12
+ This module NEVER raises on network or parse error — it returns None so
13
+ the caller can degrade gracefully. The honesty principle: "we tried and
14
+ failed to resolve the tie" is a legitimate outcome, not a fatal error.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ import struct
21
+ from typing import Any
22
+
23
+ import httpx
24
+
25
+ from llm_cal.model_source.auth import get_hf_token, get_modelscope_token
26
+ from llm_cal.model_source.base import SiblingFile
27
+
28
# Largest header JSON we accept (16 MB) — far above any realistic header;
# anything bigger is treated as malformed.
_MAX_HEADER_BYTES = 16 * 1024 * 1024
# The fetched prefix must hold the 8-byte length field plus a maximum-size
# header; otherwise a header close to the cap would always arrive truncated.
_RANGE_FETCH_BYTES = 8 + _MAX_HEADER_BYTES
_DEFAULT_TIMEOUT_S = 15.0
31
+
32
+
33
def pick_sample_shard(siblings: tuple[SiblingFile, ...]) -> SiblingFile | None:
    """Select one safetensors file whose header should represent the model.

    Selection rules, in order:
      1. No ``*.safetensors`` file at all -> ``None``.
      2. A file named exactly ``model.safetensors`` (the single-file layout)
         wins outright.
      3. Otherwise the safetensors files are sorted by filename and the middle
         one is returned. The first shard tends to hold embeddings, lm_head
         and early norms, which are often left in BF16/FP16 even when the bulk
         of the model is quantized; a middle shard is more likely to show the
         headline quantization. Files that do not follow the shard naming
         convention go through the same sort-and-take-the-middle rule.
    """
    shards = [entry for entry in siblings if entry.filename.endswith(".safetensors")]
    if not shards:
        return None

    single_file = next(
        (entry for entry in shards if entry.filename == "model.safetensors"),
        None,
    )
    if single_file is not None:
        return single_file

    shards.sort(key=lambda entry: entry.filename)
    return shards[len(shards) // 2]
56
+
57
+
58
def fetch_tensor_dtypes(
    source: str,
    model_id: str,
    revision: str,
    shard_filename: str,
    endpoint: str | None = None,
    timeout_s: float = _DEFAULT_TIMEOUT_S,
) -> dict[str, str] | None:
    """Range-fetch the safetensors header of one shard and return dtype map.

    Args:
        source: Model hub identifier; HuggingFace and ModelScope are supported.
        model_id: Repository id on that hub.
        revision: Branch, tag or commit to read from.
        shard_filename: Path of the safetensors file inside the repository.
        endpoint: Optional base URL overriding the hub's default.
        timeout_s: Timeout handed to httpx for the request.

    Returns:
        A dict of `{tensor_name: dtype_string}` on success, None on any
        failure (unsupported source, network error, non-200/206 status,
        unparseable header). Non-fatal by design, so the reconciler can still
        report a verdict without per-tensor refinement.
    """
    url, headers = _build_request(source, model_id, revision, shard_filename, endpoint)
    if url is None:
        return None

    request_headers = {**headers, "Range": f"bytes=0-{_RANGE_FETCH_BYTES - 1}"}

    # Stream the body instead of buffering it: a server that ignores Range
    # answers 200 with the whole (possibly multi-GB) shard, and we only ever
    # need the leading bytes.
    prefix = bytearray()
    try:
        with httpx.stream(
            "GET",
            url,
            headers=request_headers,
            timeout=timeout_s,
            follow_redirects=True,
        ) as resp:
            # 200 for files returned in full; 206 for an actual Range response.
            # Anything else (404, 403, 500, ...) we degrade silently.
            if resp.status_code not in (200, 206):
                return None
            for chunk in resp.iter_bytes():
                prefix += chunk
                if len(prefix) >= _RANGE_FETCH_BYTES:
                    break
    except (httpx.HTTPError, httpx.InvalidURL):
        # HTTPError covers timeouts, connection and read failures; InvalidURL
        # is raised separately for a URL httpx cannot parse.
        return None

    return parse_header(bytes(prefix[:_RANGE_FETCH_BYTES]))
91
+
92
+
93
def _build_request(
    source: str,
    model_id: str,
    revision: str,
    shard_filename: str,
    endpoint: str | None,
) -> tuple[str | None, dict[str, str]]:
    """Compose URL + auth headers for the source.

    The revision and file path are percent-encoded so that characters such as
    ``&``, ``#``, ``+`` or spaces cannot corrupt the URL. Plain names (letters,
    digits, ``-``, ``_``, ``.``) produce exactly the same URL as before.

    Returns:
        ``(url, headers)`` for "huggingface" and "modelscope"; ``headers``
        carries a Bearer token when one is configured and is empty otherwise.
        ``(None, {})`` for any other source.
    """
    from urllib.parse import quote, urlencode

    if source == "huggingface":
        base = (endpoint or "https://huggingface.co").rstrip("/")
        # The revision is a single path segment, so its slashes must be
        # escaped (e.g. "refs/pr/1"); the file path keeps its slashes.
        revision_segment = quote(revision, safe="")
        file_path = quote(shard_filename)
        url = f"{base}/{model_id}/resolve/{revision_segment}/{file_path}"
        token = get_hf_token()
        headers = {"Authorization": f"Bearer {token}"} if token else {}
        return url, headers
    if source == "modelscope":
        # ModelScope raw-file endpoint takes the path via query string and
        # 302-redirects to the underlying OSS object. httpx follows the
        # redirect; OSS honors Range natively.
        base = (endpoint or "https://www.modelscope.cn").rstrip("/")
        query = urlencode(
            {"FilePath": shard_filename, "Revision": revision},
            quote_via=quote,
            safe="/",
        )
        url = f"{base}/api/v1/models/{model_id}/repo?{query}"
        token = get_modelscope_token()
        headers = {"Authorization": f"Bearer {token}"} if token else {}
        return url, headers
    return None, {}
122
+
123
+
124
def parse_header(content: bytes) -> dict[str, str] | None:
    """Parse the safetensors binary header from a leading byte buffer.

    Layout: an 8-byte little-endian unsigned length N, followed by N bytes of
    JSON mapping tensor names to ``{dtype, shape, data_offsets}``.

    Args:
        content: The first bytes of a safetensors file; must contain the whole
            header (``8 + N`` bytes) to be parseable.

    Returns:
        ``{tensor_name: dtype_string}`` for every entry with a string
        ``dtype``, skipping the ``__metadata__`` entry. ``None`` for any
        malformed input — too short, zero or oversized length, truncated or
        invalid JSON, non-object JSON, or no usable tensor entry — rather than
        raising.
    """
    if len(content) < 8:
        return None

    # Cannot fail: the buffer is known to hold at least 8 bytes.
    (header_len,) = struct.unpack_from("<Q", content)
    if header_len == 0 or header_len > _MAX_HEADER_BYTES:
        return None

    header_end = 8 + header_len
    if len(content) < header_end:
        return None

    try:
        header: Any = json.loads(content[8:header_end])
    except (ValueError, RecursionError):
        # ValueError covers JSONDecodeError and UnicodeDecodeError;
        # RecursionError guards against pathologically nested input, since the
        # header comes from an untrusted remote file.
        return None

    if not isinstance(header, dict):
        return None

    dtypes: dict[str, str] = {}
    for name, info in header.items():
        if name == "__metadata__" or not isinstance(info, dict):
            continue
        dtype = info.get("dtype")
        if isinstance(dtype, str):
            dtypes[name] = dtype

    return dtypes or None