Spaces:
Running
Running
GitHub Actions committed on
Commit ·
cc6274a
0
Parent(s):
Auto-deploy from GitHub Actions
Browse files- README.md +38 -0
- app.py +2376 -0
- requirements.txt +2 -0
- src/llm_cal/__init__.py +6 -0
- src/llm_cal/architecture/__init__.py +0 -0
- src/llm_cal/architecture/detector.py +134 -0
- src/llm_cal/architecture/formulas/__init__.py +0 -0
- src/llm_cal/architecture/formulas/kv_cache.py +145 -0
- src/llm_cal/architecture/formulas/weight.py +133 -0
- src/llm_cal/architecture/profile.py +97 -0
- src/llm_cal/architecture/traits.py +150 -0
- src/llm_cal/benchmark/__init__.py +0 -0
- src/llm_cal/benchmark/dataset.yaml +203 -0
- src/llm_cal/benchmark/runner.py +232 -0
- src/llm_cal/cli.py +207 -0
- src/llm_cal/command_generator/__init__.py +0 -0
- src/llm_cal/command_generator/sglang.py +50 -0
- src/llm_cal/command_generator/vllm.py +55 -0
- src/llm_cal/common/__init__.py +0 -0
- src/llm_cal/common/i18n.py +421 -0
- src/llm_cal/common/yaml_loader.py +48 -0
- src/llm_cal/core/__init__.py +0 -0
- src/llm_cal/core/cache.py +97 -0
- src/llm_cal/core/evaluator.py +375 -0
- src/llm_cal/core/explain.py +504 -0
- src/llm_cal/engine_compat/__init__.py +0 -0
- src/llm_cal/engine_compat/loader.py +118 -0
- src/llm_cal/engine_compat/matrix.yaml +512 -0
- src/llm_cal/fleet/__init__.py +0 -0
- src/llm_cal/fleet/planner.py +282 -0
- src/llm_cal/hardware/__init__.py +0 -0
- src/llm_cal/hardware/gpu_database.yaml +613 -0
- src/llm_cal/hardware/loader.py +77 -0
- src/llm_cal/llm_review/__init__.py +0 -0
- src/llm_cal/llm_review/reviewer.py +218 -0
- src/llm_cal/model_source/__init__.py +0 -0
- src/llm_cal/model_source/auth.py +33 -0
- src/llm_cal/model_source/base.py +58 -0
- src/llm_cal/model_source/huggingface.py +118 -0
- src/llm_cal/model_source/modelscope.py +229 -0
- src/llm_cal/output/__init__.py +0 -0
- src/llm_cal/output/formatter.py +665 -0
- src/llm_cal/output/labels.py +46 -0
- src/llm_cal/performance/__init__.py +0 -0
- src/llm_cal/performance/compute.py +233 -0
- src/llm_cal/performance/concurrency.py +132 -0
- src/llm_cal/weight_analyzer/__init__.py +146 -0
- src/llm_cal/weight_analyzer/fingerprint.py +292 -0
- src/llm_cal/weight_analyzer/reconciler.py +247 -0
- src/llm_cal/weight_analyzer/safetensors_reader.py +163 -0
README.md
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: llm-cal
|
| 3 |
+
emoji: 🧮
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 6.13.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: apache-2.0
|
| 11 |
+
short_description: LLM inference sizing — honest, architecture-aware
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# llm-cal — LLM inference hardware calculator
|
| 15 |
+
|
| 16 |
+
Web UI for [`llm-cal`](https://github.com/FlyTOmeLight/llm-cal). Pick a model, pick a GPU, get a hardware plan.
|
| 17 |
+
|
| 18 |
+
Architecture-aware (MLA, NSA, CSA+HCA, MoE, sliding window). Engine-aware (vLLM, SGLang). Honest-labeled — every number carries a provenance tag (`[verified]` / `[inferred]` / `[estimated]` / `[cited]` / `[unverified]` / `[unknown]`).
|
| 19 |
+
|
| 20 |
+
## The story this Space exists to tell
|
| 21 |
+
|
| 22 |
+
`gpu_poor` reports DeepSeek-V4-Flash as 284 GB by assuming pure FP8. The real safetensors weight is 160 GB — it ships an FP4+FP8 mixed pack. `llm-cal` reads the actual on-disk dtype (per-tensor metadata + MX block-scaled scale tensors) and gets 160.01 GB at **0.2% error**.
|
| 23 |
+
|
| 24 |
+
That's the whole pitch.
|
| 25 |
+
|
| 26 |
+
## Local
|
| 27 |
+
|
| 28 |
+
```bash
|
| 29 |
+
pip install llm-cal gradio
|
| 30 |
+
python app.py
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
## Links
|
| 34 |
+
|
| 35 |
+
- [GitHub repo](https://github.com/FlyTOmeLight/llm-cal)
|
| 36 |
+
- [Full docs](https://flytomelight.github.io/llm-cal/)
|
| 37 |
+
- [Methodology](https://flytomelight.github.io/llm-cal/methodology/) — every formula's primary source
|
| 38 |
+
- [Pre-rendered model pages](https://flytomelight.github.io/llm-cal/models/) — popular model × GPU combos
|
app.py
ADDED
|
@@ -0,0 +1,2376 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""llm-cal Gradio web app — deploys to HuggingFace Spaces.
|
| 2 |
+
|
| 3 |
+
User journey:
|
| 4 |
+
1. Type a HuggingFace model id (or pick from examples)
|
| 5 |
+
2. Choose target GPU
|
| 6 |
+
3. Hit Calculate
|
| 7 |
+
4. Read the same `--explain`-quality output the CLI gives you, but in a browser
|
| 8 |
+
and shareable via URL parameters.
|
| 9 |
+
|
| 10 |
+
The whole compute is the existing Python `Evaluator`. No new logic.
|
| 11 |
+
|
| 12 |
+
Local run:
|
| 13 |
+
python web/app.py
|
| 14 |
+
HF Spaces:
|
| 15 |
+
This file is the entry point Spaces expects. requirements.txt sits next to it.
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
from __future__ import annotations
|
| 19 |
+
|
| 20 |
+
import sys
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
|
| 23 |
+
# Ensure src/ is importable. Two layouts supported:
|
| 24 |
+
# 1. Local dev: /repo/web/app.py + /repo/src/ (parent.parent / src)
|
| 25 |
+
# 2. HF Space: /space/app.py + /space/src/ (parent / src)
|
| 26 |
+
# The deploy workflow flattens layout 1 → layout 2 when pushing to the Space.
|
| 27 |
+
# Directory containing this file; both supported layouts keep ``src`` either
# next to it (HF Space) or one level up (local dev checkout).
_HERE = Path(__file__).resolve().parent
for _candidate in (_HERE / "src", _HERE.parent / "src"):
    if _candidate.exists():
        # Prepend so the bundled package wins over any installed copy, and
        # stop at the first hit so only one layout is ever registered.
        sys.path.insert(0, str(_candidate))
        break
|
| 32 |
+
|
| 33 |
+
import os # noqa: E402
|
| 34 |
+
|
| 35 |
+
import gradio as gr # noqa: E402
|
| 36 |
+
|
| 37 |
+
from llm_cal.common.i18n import set_locale, t # noqa: E402
|
| 38 |
+
from llm_cal.core.evaluator import EvaluationReport, Evaluator # noqa: E402
|
| 39 |
+
from llm_cal.core.explain import ExplainEntry # noqa: E402
|
| 40 |
+
from llm_cal.core.explain import build as build_explain # noqa: E402
|
| 41 |
+
from llm_cal.hardware.loader import load_database # noqa: E402
|
| 42 |
+
from llm_cal.llm_review.reviewer import run_review # noqa: E402
|
| 43 |
+
from llm_cal.model_source.huggingface import HuggingFaceSource # noqa: E402
|
| 44 |
+
from llm_cal.model_source.modelscope import ModelScopeSource # noqa: E402
|
| 45 |
+
|
| 46 |
+
# ---------------------------------------------------------------------------
|
| 47 |
+
# Static data the UI needs
|
| 48 |
+
|
| 49 |
+
# Hardware database, loaded once when this module is imported.
# `_build_vendor_index` iterates `_DB.gpus` and reads each entry's `.id`.
_DB = load_database()
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _classify_vendor(gpu_id: str) -> tuple[str, str]:
    """Map a GPU id to its vendor display names.

    Vendor isn't in the YAML schema (yet), so it is derived from the id,
    compared case-insensitively (the id is upper-cased first).

    Args:
        gpu_id: GPU identifier, e.g. ``"H800"`` or ``"910B4"``.

    Returns:
        ``(vendor_en, vendor_zh)``. Ids that match no known rule yield
        ``("Other", "其他")``.
    """
    gid = gpu_id.upper()

    # NVIDIA: a handful of ids are matched exactly, the rest by family prefix.
    # NOTE(review): the exact-match set means a suffixed id such as
    # "H100-SXM" would fall through to "Other" — confirm against the ids
    # actually present in the GPU database.
    if gid in {"B200", "GB200", "H100", "H800", "H200", "H20", "GH200"} or gid.startswith(
        ("L4", "L40", "RTX", "A10", "A100", "A40", "V100", "T4")
    ):
        return ("NVIDIA", "NVIDIA")
    if gid.startswith("MI"):
        return ("AMD", "AMD")
    if gid.startswith("GAUDI"):
        return ("Intel Habana", "英特尔 Habana")
    if gid.startswith(("910", "ATLAS")):
        return ("Huawei Ascend", "华为昇腾")
    if gid.startswith("MXC"):
        return ("MetaX 沐曦", "沐曦 MetaX")
    if gid.startswith("KUNLUN"):
        return ("Kunlunxin 昆仑芯", "昆仑芯 Kunlunxin")
    if gid.startswith("BR"):
        return ("Biren 壁仞", "壁仞 Biren")
    if gid.startswith("BI-"):
        return ("Iluvatar 天数智芯", "天数智芯 Iluvatar")
    if gid.startswith(("MR-", "MTT")):
        return ("Moore Threads 摩尔线程", "摩尔线程 Moore Threads")
    if gid.startswith("MLU"):
        return ("Cambricon 寒武纪", "寒武纪 Cambricon")
    if gid.startswith("HYGON"):
        return ("Hygon 海光", "海光 Hygon")
    return ("Other", "其他")
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
# Stable vendor display order
|
| 86 |
+
# English vendor names in the order they are shown in the UI. The entries
# match the first element of the tuples returned by `_classify_vendor`.
_VENDOR_ORDER = [
    "NVIDIA",
    "AMD",
    "Intel Habana",
    "Huawei Ascend",
    "MetaX 沐曦",
    "Kunlunxin 昆仑芯",
    "Biren 壁仞",
    "Iluvatar 天数智芯",
    "Moore Threads 摩尔线程",
    "Cambricon 寒武纪",
    "Hygon 海光",
    "Other",
]
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def _build_vendor_index() -> dict[str, list[str]]:
    """Group the GPU ids in ``_DB`` by English vendor name.

    Returns:
        Mapping of ``vendor_en`` to a sorted list of GPU ids. Keys follow
        ``_VENDOR_ORDER``; vendors with no GPUs are omitted.
    """
    # Seed every known vendor first so dict insertion order follows
    # _VENDOR_ORDER regardless of the order GPUs appear in the database.
    buckets: dict[str, list[str]] = {vendor: [] for vendor in _VENDOR_ORDER}
    for gpu in _DB.gpus:
        vendor_en, _ = _classify_vendor(gpu.id)
        # setdefault covers a vendor name missing from _VENDOR_ORDER.
        buckets.setdefault(vendor_en, []).append(gpu.id)
    # Sort each bucket and drop the empty ones in a single pass.
    return {vendor: sorted(ids) for vendor, ids in buckets.items() if ids}
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
# vendor_en -> sorted GPU ids, computed once at import time.
_VENDOR_TO_GPUS = _build_vendor_index()
# Vendor names in display order (dict insertion order of the index above).
VENDOR_CHOICES_EN: list[str] = list(_VENDOR_TO_GPUS)
DEFAULT_VENDOR = "NVIDIA"
DEFAULT_GPU = "H800"
|
| 118 |
+
|
| 119 |
+
# Example rows; each tuple is (model_id, vendor, gpu, engine, source).
EXAMPLE_MODELS: list[tuple[str, str, str, str, str]] = [
    ("deepseek-ai/DeepSeek-V4-Flash", "NVIDIA", "H800", "vllm", "HuggingFace"),
    ("deepseek-ai/DeepSeek-V3", "NVIDIA", "H800", "vllm", "HuggingFace"),
    ("Qwen/Qwen2.5-72B-Instruct", "NVIDIA", "H100", "vllm", "HuggingFace"),
    ("Qwen/Qwen3-30B-A3B", "NVIDIA", "A100-80G", "vllm", "HuggingFace"),
    ("mistralai/Mixtral-8x7B-v0.1", "NVIDIA", "H100", "vllm", "HuggingFace"),
    ("microsoft/Phi-4", "NVIDIA", "RTX4090", "vllm", "HuggingFace"),
    ("deepseek-ai/DeepSeek-V4-Flash", "Huawei Ascend", "910B4", "vllm", "HuggingFace"),
    # ModelScope examples — same models, China-side mirror.
    ("Qwen/Qwen3-30B-A3B", "NVIDIA", "A100-80G", "vllm", "ModelScope"),
    ("deepseek-ai/DeepSeek-V3", "Huawei Ascend", "910B4", "vllm", "ModelScope"),
]
|
| 132 |
+
|
| 133 |
+
# ---------------------------------------------------------------------------
|
| 134 |
+
# Output rendering
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def _fmt_bytes(n: int | None) -> str:
    """Format a byte count as a human-readable string.

    Units step by a factor of 1024 and are labelled B/KB/MB/GB/TB/PB.

    Args:
        n: Number of bytes, or ``None`` when the value is not available.

    Returns:
        ``"—"`` for ``None``; ``"<n> B"`` below 1024; otherwise the value
        with two decimals and the largest unit that keeps it under 1024
        (PB is the last unit and is not capped).
    """
    if n is None:
        return "—"
    if n < 1024:
        return f"{n} B"
    scaled = float(n)
    for unit in ("KB", "MB", "GB", "TB"):
        scaled /= 1024
        if scaled < 1024:
            return f"{scaled:.2f} {unit}"
    # `scaled` is still expressed in TB here; divide once more so the
    # number matches the PB label (previously 2 PiB rendered as "2048.00 PB").
    return f"{scaled / 1024:.2f} PB"
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def _fmt_params(n: int | None) -> str:
    """Format a parameter count compactly.

    Args:
        n: Parameter count; ``None`` and ``0`` are both treated as missing.

    Returns:
        ``"—"`` when missing, ``"<x.x>B"`` from one billion up,
        ``"<x.x>M"`` from one million up, otherwise the integer with
        thousands separators.
    """
    if not n:
        return "—"
    if n >= 1_000_000_000:
        return f"{n / 1_000_000_000:.1f}B"
    if n >= 1_000_000:
        return f"{n / 1_000_000:.1f}M"
    return f"{n:,}"
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def _label_color(label: str) -> str:
    """Map a provenance label to a CSS hex color (visible in light and dark).

    Args:
        label: Provenance label key such as ``"verified"``.

    Returns:
        A ``#rrggbb`` string; labels not in the table get the same gray as
        ``"unknown"``.
    """
    palette = {
        "verified": "#16a34a",  # green-600
        "inferred": "#2563eb",  # blue-600
        "estimated": "#d97706",  # amber-600
        "cited": "#7c3aed",  # violet-600
        "unverified": "#9a3412",  # orange-800
        "unknown": "#6b7280",  # gray-500
        "llm-opinion": "#db2777",  # pink-600
    }
    return palette.get(label, "#6b7280")
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def _label_chip(label_key: str) -> str:
    """Render a provenance label as a colored ``<span class="lc-chip">``.

    Args:
        label_key: Provenance label key; it selects the color through
            ``_label_color`` and the display text through the i18n key
            ``label.<label_key>``.

    Returns:
        An HTML ``<span>`` string. The translated text is inserted as-is,
        without HTML escaping.
    """
    color = _label_color(label_key)
    text = t(f"label.{label_key}")
    # "1a" / "55" are appended to the #rrggbb value to form 8-digit hex
    # colors, i.e. translucent variants of the same hue for the fill and
    # the border.
    return (
        f'<span class="lc-chip" style="background:{color}1a;color:{color};'
        f'border:1px solid {color}55">{text}</span>'
    )
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def _stat_card(label: str, value: str, sublabel: str = "", chip: str = "") -> str:
    """Build the HTML for one headline stat card.

    All arguments are inserted verbatim; callers must escape any untrusted
    text themselves (``chip`` is expected to already be HTML).

    Args:
        label: Caption shown under the value.
        value: The main figure.
        sublabel: Optional secondary caption; omitted when empty.
        chip: Optional HTML snippet (e.g. a provenance chip); omitted when
            empty.

    Returns:
        A ``<div class='lc-stat'>`` HTML string.
    """
    sub_html = f"<div class='lc-stat-sub'>{sublabel}</div>" if sublabel else ""
    chip_html = f"<div class='lc-stat-chip'>{chip}</div>" if chip else ""
    return (
        f"<div class='lc-stat'>"
        f"<div class='lc-stat-value'>{value}</div>"
        f"<div class='lc-stat-label'>{label}</div>"
        f"{sub_html}{chip_html}"
        f"</div>"
    )
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def _esc(s: str) -> str:
    """Escape text for safe insertion into HTML element content.

    The ampersand and the two angle brackets are replaced with their HTML
    entities (ampersand first, so the entities themselves are not
    re-escaped). Quotes are left untouched, so the result is meant for
    element content, not attribute values.

    Args:
        s: Value to escape; it is passed through ``str()`` first, so
            non-string values are accepted.

    Returns:
        The escaped string.
    """
    # Local import keeps the helper self-contained. The previous chain of
    # .replace() calls mapped each character to itself and escaped nothing.
    import html

    return html.escape(str(s), quote=False)
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def _render(report: EvaluationReport, locale: str) -> str:
|
| 205 |
+
set_locale(locale) # type: ignore[arg-type]
|
| 206 |
+
is_zh = locale == "zh"
|
| 207 |
+
|
| 208 |
+
p, w, r, f = report.profile, report.weight, report.reconciliation, report.fleet
|
| 209 |
+
|
| 210 |
+
# ---- Headline stat cards -------------------------------------------------
|
| 211 |
+
weight_str = _fmt_bytes(w.total_bytes.value)
|
| 212 |
+
weight_chip = _label_chip(w.total_bytes.label.value)
|
| 213 |
+
quant_chip = _label_chip(w.quantization_guess.label.value)
|
| 214 |
+
prod_opt = (
|
| 215 |
+
next((o for o in (f.options if f else []) if o.tier == "prod"), None) if f else None
|
| 216 |
+
)
|
| 217 |
+
prod_gpus = str(prod_opt.gpu_count) if prod_opt else "—"
|
| 218 |
+
prod_concurrent = str(prod_opt.max_concurrent_at_reference_ctx) if prod_opt else "—"
|
| 219 |
+
|
| 220 |
+
headline = (
|
| 221 |
+
f"<div class='lc-header'>"
|
| 222 |
+
f"<div class='lc-title'>{_esc(report.model_id)}</div>"
|
| 223 |
+
f"<div class='lc-subtitle'>"
|
| 224 |
+
f"{_esc(report.gpu)} · {_esc(report.engine)}"
|
| 225 |
+
f"</div></div>"
|
| 226 |
+
f"<div class='lc-stats'>"
|
| 227 |
+
f"{_stat_card('Weight' if not is_zh else '权重', weight_str, sublabel='from safetensors API' if not is_zh else '取自 safetensors API', chip=weight_chip)}"
|
| 228 |
+
f"{_stat_card('Quantization' if not is_zh else '量化', _esc(w.quantization_guess.value), sublabel='resolved scheme' if not is_zh else '已识别方案', chip=quant_chip)}"
|
| 229 |
+
f"{_stat_card('Prod GPUs' if not is_zh else 'Prod GPU 数', prod_gpus, sublabel='for 16-user prod' if not is_zh else '生产档(16 路并发)')}"
|
| 230 |
+
f"{_stat_card('Users @ 128K' if not is_zh else '用户 @ 128K', prod_concurrent, sublabel='concurrent at prod tier' if not is_zh else '生产档的并发')}"
|
| 231 |
+
f"</div>"
|
| 232 |
+
)
|
| 233 |
+
|
| 234 |
+
# Provenance footer for the headline
|
| 235 |
+
quant_source = _esc(w.quantization_guess.source or "")
|
| 236 |
+
headline += f"<div class='lc-prov'>{quant_source}</div>"
|
| 237 |
+
|
| 238 |
+
# ---- Architecture --------------------------------------------------------
|
| 239 |
+
arch_rows: list[tuple[str, str]] = [("model_type", p.model_type)]
|
| 240 |
+
if p.attention:
|
| 241 |
+
arch_rows.append(
|
| 242 |
+
(
|
| 243 |
+
"attention",
|
| 244 |
+
f"{p.attention.variant} (heads={p.attention.num_heads}, "
|
| 245 |
+
f"kv_heads={p.attention.num_kv_heads}, hd={p.attention.head_dim})",
|
| 246 |
+
)
|
| 247 |
+
)
|
| 248 |
+
if p.moe:
|
| 249 |
+
arch_rows.append(
|
| 250 |
+
(
|
| 251 |
+
"moe",
|
| 252 |
+
f"{p.moe.num_routed_experts} routed + "
|
| 253 |
+
f"{p.moe.num_shared_experts} shared, top-{p.moe.num_experts_per_tok}",
|
| 254 |
+
)
|
| 255 |
+
)
|
| 256 |
+
if p.sliding_window:
|
| 257 |
+
arch_rows.append(("sliding_window", str(p.sliding_window)))
|
| 258 |
+
|
| 259 |
+
arch_html = "".join(
|
| 260 |
+
f"<tr><td><code>{_esc(k)}</code></td><td><code>{_esc(v)}</code></td></tr>"
|
| 261 |
+
for k, v in arch_rows
|
| 262 |
+
)
|
| 263 |
+
arch_explainer = (
|
| 264 |
+
"从模型 config.json 读出来的,决定后续所有公式怎么走(是否分组注意力、是否 MoE、是否滑动窗口)。"
|
| 265 |
+
if is_zh
|
| 266 |
+
else "Read straight from the model's config.json. Drives every formula "
|
| 267 |
+
"downstream — attention sharding, MoE active-expert ratio, sliding window."
|
| 268 |
+
)
|
| 269 |
+
arch_section = (
|
| 270 |
+
f"<div class='lc-section'><h3>{'架构' if is_zh else 'Architecture'}</h3>"
|
| 271 |
+
f"<div class='lc-section-help'>{arch_explainer}</div>"
|
| 272 |
+
f"<table class='lc-table'>{arch_html}</table></div>"
|
| 273 |
+
)
|
| 274 |
+
|
| 275 |
+
# ---- Reconciliation ------------------------------------------------------
|
| 276 |
+
recon_rows = []
|
| 277 |
+
for c in r.candidates[:5]:
|
| 278 |
+
is_best = c.scheme == r.best.value
|
| 279 |
+
cls = " class='lc-best'" if is_best else ""
|
| 280 |
+
marker = " ✓" if is_best else ""
|
| 281 |
+
recon_rows.append(
|
| 282 |
+
f"<tr{cls}><td><code>{_esc(c.scheme)}</code>{marker}</td>"
|
| 283 |
+
f"<td>{_fmt_bytes(c.predicted_bytes)}</td>"
|
| 284 |
+
f"<td>{c.relative_error * 100:.1f}%</td></tr>"
|
| 285 |
+
)
|
| 286 |
+
recon_explainer = (
|
| 287 |
+
"用每种量化方案预测应该有多少字节,跟实际 safetensors 字节对比。误差最小的胜出。"
|
| 288 |
+
"FP4_FP8_MIXED / GPTQ_INT4 / AWQ_INT4 在 0.55 bpp 处会打平,需要 config 或 dtype 进一步区分。"
|
| 289 |
+
if is_zh
|
| 290 |
+
else "Predict bytes under each quantization hypothesis, compare against the real "
|
| 291 |
+
"safetensors size. Lowest error wins. FP4_FP8_MIXED / GPTQ_INT4 / AWQ_INT4 tie "
|
| 292 |
+
"at 0.55 bpp — broken via config.json or per-tensor dtype."
|
| 293 |
+
)
|
| 294 |
+
recon_section = (
|
| 295 |
+
f"<div class='lc-section'>"
|
| 296 |
+
f"<h3>{'量化反演' if is_zh else 'Quantization reconciliation'}</h3>"
|
| 297 |
+
f"<div class='lc-section-help'>{recon_explainer}</div>"
|
| 298 |
+
f"<table class='lc-table lc-table-recon'>"
|
| 299 |
+
f"<thead><tr><th>Scheme</th>"
|
| 300 |
+
f"<th>{'预测字节' if is_zh else 'Predicted'}</th>"
|
| 301 |
+
f"<th>{'误差' if is_zh else 'Error'}</th></tr></thead>"
|
| 302 |
+
f"<tbody>{''.join(recon_rows)}</tbody></table></div>"
|
| 303 |
+
)
|
| 304 |
+
|
| 305 |
+
# ---- Fleet ---------------------------------------------------------------
|
| 306 |
+
fleet_section = ""
|
| 307 |
+
if f and f.options:
|
| 308 |
+
# Pick which context lengths get their own concurrency column.
|
| 309 |
+
# Always include 128K if any option has it; also include the model max
|
| 310 |
+
# if it's larger (e.g. 1M for DeepSeek-V4-Flash) so the user can compare
|
| 311 |
+
# "fits 23 users at 128K but only 2 at 1M".
|
| 312 |
+
all_ctxs: set[int] = set()
|
| 313 |
+
for opt in f.options:
|
| 314 |
+
for ctx, _ in opt.max_concurrent_by_context:
|
| 315 |
+
all_ctxs.add(ctx)
|
| 316 |
+
ctx_cols: list[int] = []
|
| 317 |
+
if 131_072 in all_ctxs:
|
| 318 |
+
ctx_cols.append(131_072)
|
| 319 |
+
max_ctx = max(all_ctxs) if all_ctxs else 0
|
| 320 |
+
if max_ctx > 131_072 and max_ctx not in ctx_cols:
|
| 321 |
+
ctx_cols.append(max_ctx)
|
| 322 |
+
if not ctx_cols and all_ctxs:
|
| 323 |
+
ctx_cols.append(max_ctx)
|
| 324 |
+
|
| 325 |
+
def _ctx_label(ctx: int) -> str:
|
| 326 |
+
if ctx >= 1_000_000:
|
| 327 |
+
return f"{ctx // 1_000_000}M" if ctx % 1_000_000 == 0 else f"{ctx / 1_000_000:.1f}M"
|
| 328 |
+
if ctx >= 1024:
|
| 329 |
+
return f"{ctx // 1024}K"
|
| 330 |
+
return str(ctx)
|
| 331 |
+
|
| 332 |
+
rows = []
|
| 333 |
+
for opt in f.options:
|
| 334 |
+
star = " ★" if opt.tier == f.best_tier else ""
|
| 335 |
+
cls = " class='lc-best'" if opt.tier == f.best_tier else ""
|
| 336 |
+
headroom = max(0, opt.usable_bytes_per_gpu - opt.weight_bytes_per_gpu)
|
| 337 |
+
ctx_map = dict(opt.max_concurrent_by_context)
|
| 338 |
+
ctx_cells = "".join(f"<td>{ctx_map.get(c, '—')}</td>" for c in ctx_cols)
|
| 339 |
+
rows.append(
|
| 340 |
+
f"<tr{cls}><td><code>{opt.tier}{star}</code></td>"
|
| 341 |
+
f"<td>{opt.gpu_count}</td>"
|
| 342 |
+
f"<td>{_fmt_bytes(opt.weight_bytes_per_gpu)}</td>"
|
| 343 |
+
f"<td>{_fmt_bytes(headroom)}</td>"
|
| 344 |
+
f"{ctx_cells}</tr>"
|
| 345 |
+
)
|
| 346 |
+
|
| 347 |
+
ctx_headers = "".join(
|
| 348 |
+
f"<th>{('@ ' + _ctx_label(c) + ' 并发') if is_zh else ('Concurrent @ ' + _ctx_label(c))}</th>"
|
| 349 |
+
for c in ctx_cols
|
| 350 |
+
)
|
| 351 |
+
fleet_explainer = (
|
| 352 |
+
"min = 刚好放得下;dev = 8 路并发场景;prod = 16 路并发场景。★ = 推荐。"
|
| 353 |
+
if is_zh
|
| 354 |
+
else "min = barely fits weights; dev = sized for 8 concurrent at 128K; "
|
| 355 |
+
"prod = sized for 16 concurrent at 128K. ★ = recommended."
|
| 356 |
+
)
|
| 357 |
+
fleet_section = (
|
| 358 |
+
f"<div class='lc-section'>"
|
| 359 |
+
f"<h3>{'推荐集群' if is_zh else 'Recommended fleet'}</h3>"
|
| 360 |
+
f"<div class='lc-section-help'>{fleet_explainer}</div>"
|
| 361 |
+
f"<table class='lc-table'>"
|
| 362 |
+
f"<thead><tr><th>Tier</th><th>GPUs</th>"
|
| 363 |
+
f"<th>Weight/GPU</th><th>Headroom/GPU</th>"
|
| 364 |
+
f"{ctx_headers}</tr></thead>"
|
| 365 |
+
f"<tbody>{''.join(rows)}</tbody></table></div>"
|
| 366 |
+
)
|
| 367 |
+
|
| 368 |
+
# ---- Performance ---------------------------------------------------------
|
| 369 |
+
perf_explainer = (
|
| 370 |
+
"Prefill 用算力公式(FLOPs = 2 × 参数 × 输入 token),decode 用带宽公式(吞吐 = 带宽 × 利用率 / 权重字节)。"
|
| 371 |
+
"Bottleneck 标 memory_bandwidth 说明 decode 是带宽瓶颈,加显存带宽更高的 GPU 比加算力更划算。"
|
| 372 |
+
if is_zh
|
| 373 |
+
else "Prefill uses the compute formula (FLOPs = 2 × params × input_tokens, Kaplan 2020). "
|
| 374 |
+
"Decode uses memory-bandwidth formula (tps = BW × util / weight_bytes, vLLM paper). "
|
| 375 |
+
"Bottleneck = memory_bandwidth means a higher-BW GPU helps more than more FLOPS."
|
| 376 |
+
)
|
| 377 |
+
perf_section = ""
|
| 378 |
+
if report.prefill and report.decode and report.concurrency:
|
| 379 |
+
max_users = report.concurrency.max_concurrent.value
|
| 380 |
+
bn = report.concurrency.bottleneck
|
| 381 |
+
items = [
|
| 382 |
+
(
|
| 383 |
+
"Prefill latency" if not is_zh else "Prefill 延迟",
|
| 384 |
+
f"{report.prefill.latency_ms.value:.0f} ms",
|
| 385 |
+
f"@ {report.perf_input_tokens or 2000} input tokens",
|
| 386 |
+
),
|
| 387 |
+
(
|
| 388 |
+
"Cluster decode" if not is_zh else "集群 decode 吞吐",
|
| 389 |
+
f"{report.decode.cluster_tokens_per_sec.value:.0f} tok/s",
|
| 390 |
+
"",
|
| 391 |
+
),
|
| 392 |
+
(
|
| 393 |
+
"Max concurrent users" if not is_zh else "最大并发用户",
|
| 394 |
+
str(max_users),
|
| 395 |
+
"",
|
| 396 |
+
),
|
| 397 |
+
(
|
| 398 |
+
"Bottleneck" if not is_zh else "瓶颈",
|
| 399 |
+
f"<code>{_esc(bn)}</code>",
|
| 400 |
+
"",
|
| 401 |
+
),
|
| 402 |
+
]
|
| 403 |
+
items_html = "".join(
|
| 404 |
+
f"<div class='lc-perf-item'>"
|
| 405 |
+
f"<div class='lc-perf-value'>{v}</div>"
|
| 406 |
+
f"<div class='lc-perf-label'>{_esc(label)}</div>"
|
| 407 |
+
f"<div class='lc-perf-sub'>{_esc(sub)}</div></div>"
|
| 408 |
+
for label, v, sub in items
|
| 409 |
+
)
|
| 410 |
+
perf_section = (
|
| 411 |
+
f"<div class='lc-section'>"
|
| 412 |
+
f"<h3>{'性能' if is_zh else 'Performance'}</h3>"
|
| 413 |
+
f"<div class='lc-section-help'>{perf_explainer}</div>"
|
| 414 |
+
f"<div class='lc-perf'>{items_html}</div></div>"
|
| 415 |
+
)
|
| 416 |
+
|
| 417 |
+
# ---- KV cache per request -----------------------------------------------
|
| 418 |
+
kv_section = ""
|
| 419 |
+
if report.kv_cache_by_context:
|
| 420 |
+
rows = []
|
| 421 |
+
for ctx, av in sorted(report.kv_cache_by_context.items()):
|
| 422 |
+
rows.append(
|
| 423 |
+
f"<tr><td>{ctx:,}</td><td>{_fmt_bytes(av.value)}</td>"
|
| 424 |
+
f"<td>{_label_chip(av.label.value)}</td></tr>"
|
| 425 |
+
)
|
| 426 |
+
kv_explainer = (
|
| 427 |
+
"单个请求在不同 context 长度下需要多少 KV 缓存。这是决定一张 GPU 能并发跑多少请求的关键。"
|
| 428 |
+
"MLA / MQA 模型这里会比标准 GQA 小很多。"
|
| 429 |
+
if is_zh
|
| 430 |
+
else "How much KV cache one request consumes at each context length. "
|
| 431 |
+
"This is what limits per-GPU concurrency. MLA / MQA models are "
|
| 432 |
+
"dramatically smaller here than standard GQA."
|
| 433 |
+
)
|
| 434 |
+
kv_section = (
|
| 435 |
+
f"<div class='lc-section'>"
|
| 436 |
+
f"<h3>{'KV 缓存(每请求)' if is_zh else 'KV cache per request'}</h3>"
|
| 437 |
+
f"<div class='lc-section-help'>{kv_explainer}</div>"
|
| 438 |
+
f"<table class='lc-table lc-table-recon'>"
|
| 439 |
+
f"<thead><tr><th>{'Context tokens' if not is_zh else 'Context 长度'}</th>"
|
| 440 |
+
f"<th>{'KV bytes' if not is_zh else 'KV 字节'}</th>"
|
| 441 |
+
f"<th>{'Label' if not is_zh else '标签'}</th></tr></thead>"
|
| 442 |
+
f"<tbody>{''.join(rows)}</tbody></table></div>"
|
| 443 |
+
)
|
| 444 |
+
|
| 445 |
+
# ---- Engine compatibility -----------------------------------------------
|
| 446 |
+
engine_section = ""
|
| 447 |
+
em = report.engine_match
|
| 448 |
+
if em:
|
| 449 |
+
def _fmt_flag(f) -> str: # noqa: ANN001
|
| 450 |
+
base = f"{f.flag} {f.value}".strip()
|
| 451 |
+
return base
|
| 452 |
+
flags = ", ".join(_fmt_flag(f) for f in em.required_flags) if em.required_flags else "—"
|
| 453 |
+
opt_flags = ", ".join(_fmt_flag(f) for f in em.optional_flags) if em.optional_flags else "—"
|
| 454 |
+
caveats = em.caveats_zh if is_zh else em.caveats_en
|
| 455 |
+
sources_html = "—"
|
| 456 |
+
if em.sources:
|
| 457 |
+
sources_html = "<br>".join(
|
| 458 |
+
f'<a href="{_esc(s.url)}" target="_blank" rel="noopener">{_esc(s.url)}</a>'
|
| 459 |
+
+ (
|
| 460 |
+
f" <span class='lc-prov'>({_esc(s.captured_date)})</span>"
|
| 461 |
+
if s.captured_date
|
| 462 |
+
else ""
|
| 463 |
+
)
|
| 464 |
+
for s in em.sources
|
| 465 |
+
)
|
| 466 |
+
rows = [
|
| 467 |
+
(("引擎" if is_zh else "Engine"), f"<code>{_esc(em.engine)}</code>"),
|
| 468 |
+
(
|
| 469 |
+
("版本要求" if is_zh else "Version"),
|
| 470 |
+
f"<code>{_esc(em.version_spec)}</code>",
|
| 471 |
+
),
|
| 472 |
+
(
|
| 473 |
+
("支持级别" if is_zh else "Support"),
|
| 474 |
+
_label_chip(em.support) if em.support in {"verified", "cited", "unverified"} else f"<code>{_esc(em.support)}</code>",
|
| 475 |
+
),
|
| 476 |
+
(
|
| 477 |
+
("验证级别" if is_zh else "Verification"),
|
| 478 |
+
_label_chip(em.verification_level),
|
| 479 |
+
),
|
| 480 |
+
(("必需 flag" if is_zh else "Required flags"), f"<code>{_esc(flags)}</code>"),
|
| 481 |
+
(("可选 flag" if is_zh else "Optional flags"), f"<code>{_esc(opt_flags)}</code>"),
|
| 482 |
+
]
|
| 483 |
+
if caveats:
|
| 484 |
+
rows.append((("注意事项" if is_zh else "Caveats"), _esc(caveats)))
|
| 485 |
+
rows.append((("来源" if is_zh else "Sources"), sources_html))
|
| 486 |
+
body = "".join(f"<tr><td>{k}</td><td>{v}</td></tr>" for k, v in rows)
|
| 487 |
+
engine_explainer = (
|
| 488 |
+
"这个模型在 vLLM/SGLang 哪个版本起能跑、需要哪些必需 flag、有哪些优化 flag。"
|
| 489 |
+
"verification_level 标 cited 表示从 PR / release note 引用,verified 表示实测过。"
|
| 490 |
+
if is_zh
|
| 491 |
+
else "Which engine version supports this model, what flags are required, "
|
| 492 |
+
"and which optional flags help. verification_level=cited means we got it "
|
| 493 |
+
"from a PR or release note; verified means we actually ran it."
|
| 494 |
+
)
|
| 495 |
+
engine_section = (
|
| 496 |
+
f"<div class='lc-section'>"
|
| 497 |
+
f"<h3>{'引擎兼容性' if is_zh else 'Engine compatibility'}</h3>"
|
| 498 |
+
f"<div class='lc-section-help'>{engine_explainer}</div>"
|
| 499 |
+
f"<table class='lc-table'>{body}</table></div>"
|
| 500 |
+
)
|
| 501 |
+
|
| 502 |
+
# ---- GPU spec ------------------------------------------------------------
|
| 503 |
+
gpu_section = ""
|
| 504 |
+
g = report.gpu_spec
|
| 505 |
+
if g:
|
| 506 |
+
notes = g.notes_zh if is_zh else g.notes_en
|
| 507 |
+
rows = [
|
| 508 |
+
("HBM", f"{g.memory_gb} GB"),
|
| 509 |
+
("Memory BW", f"{g.memory_bandwidth_gbps or '—'} GB/s"),
|
| 510 |
+
("NVLink BW", f"{g.nvlink_bandwidth_gbps} GB/s"),
|
| 511 |
+
("FP16 TFLOPS", f"{g.fp16_tflops}"),
|
| 512 |
+
("FP8", "✓" if g.fp8_support else "—"),
|
| 513 |
+
("FP4", "✓" if g.fp4_support else "—"),
|
| 514 |
+
]
|
| 515 |
+
rows_html = "".join(
|
| 516 |
+
f"<tr><td>{_esc(k)}</td><td><code>{_esc(v)}</code></td></tr>"
|
| 517 |
+
for k, v in rows
|
| 518 |
+
)
|
| 519 |
+
notes_html = (
|
| 520 |
+
f"<div class='lc-prov' style='margin-top:8px'>{_esc(notes)}</div>" if notes else ""
|
| 521 |
+
)
|
| 522 |
+
source_html = (
|
| 523 |
+
f"<div class='lc-prov'>{'来源' if is_zh else 'Source'}: "
|
| 524 |
+
f"<a href='{_esc(g.spec_source)}' target='_blank' rel='noopener'>"
|
| 525 |
+
f"{_esc(g.spec_source)}</a></div>"
|
| 526 |
+
if g.spec_source and g.spec_source.startswith("http")
|
| 527 |
+
else (f"<div class='lc-prov'>{_esc(g.spec_source)}</div>" if g.spec_source else "")
|
| 528 |
+
)
|
| 529 |
+
gpu_explainer = (
|
| 530 |
+
"目标 GPU 的硬件规格。Memory BW 决定 decode 能跑多快,FP8/FP4 支持决定能用什么量化。"
|
| 531 |
+
if is_zh
|
| 532 |
+
else "Hardware spec of the chosen GPU. Memory BW caps decode throughput; "
|
| 533 |
+
"FP8/FP4 support determines which quantization paths actually accelerate."
|
| 534 |
+
)
|
| 535 |
+
gpu_section = (
|
| 536 |
+
f"<div class='lc-section'>"
|
| 537 |
+
f"<h3>{'目标 GPU 规格' if is_zh else 'Target GPU spec'} — <code>{_esc(g.id)}</code></h3>"
|
| 538 |
+
f"<div class='lc-section-help'>{gpu_explainer}</div>"
|
| 539 |
+
f"<table class='lc-table'>{rows_html}</table>"
|
| 540 |
+
f"{notes_html}{source_html}"
|
| 541 |
+
f"</div>"
|
| 542 |
+
)
|
| 543 |
+
|
| 544 |
+
# ---- Generated command ---------------------------------------------------
|
| 545 |
+
cmd_section = ""
|
| 546 |
+
if report.generated_command:
|
| 547 |
+
cmd_explainer = (
|
| 548 |
+
"可以直接复制粘贴到带显卡的机器上跑。flag 是按推荐 tier 的 GPU 数 + 引擎兼容矩阵的必需 flag 自动拼的。"
|
| 549 |
+
if is_zh
|
| 550 |
+
else "Copy-pasteable on a machine with the right GPUs. Flags auto-assembled "
|
| 551 |
+
"from the recommended fleet tier + engine compat matrix's required flags."
|
| 552 |
+
)
|
| 553 |
+
cmd_section = (
|
| 554 |
+
f"<div class='lc-section'>"
|
| 555 |
+
f"<h3>{'生成命令' if is_zh else 'Generated command'}</h3>"
|
| 556 |
+
f"<div class='lc-section-help'>{cmd_explainer}</div>"
|
| 557 |
+
f"<pre class='lc-cmd'><code>{_esc(report.generated_command)}</code></pre></div>"
|
| 558 |
+
)
|
| 559 |
+
|
| 560 |
+
return (
|
| 561 |
+
"<div class='lc-result'>"
|
| 562 |
+
+ headline
|
| 563 |
+
+ arch_section
|
| 564 |
+
+ gpu_section
|
| 565 |
+
+ recon_section
|
| 566 |
+
+ kv_section
|
| 567 |
+
+ fleet_section
|
| 568 |
+
+ perf_section
|
| 569 |
+
+ engine_section
|
| 570 |
+
+ cmd_section
|
| 571 |
+
+ _render_star_cta(is_zh)
|
| 572 |
+
+ "</div>"
|
| 573 |
+
)
|
| 574 |
+
|
| 575 |
+
|
| 576 |
+
def _render_compare(reports: list[EvaluationReport], locale: str) -> str:
    """Render a side-by-side comparison of N >= 2 reports for the same model
    on different GPUs.

    Each metric row declares whether higher or lower is better (or "info"
    when the row is not contested). The cell(s) holding the best value get
    the ``lc-cmp-winner`` class so the eye snaps to them. When several GPUs
    tie for the best value they are all highlighted and all credited in the
    overall tally; when every GPU has the same value nobody is.

    Args:
        reports: One evaluation report per GPU. Must be non-empty; the
            header (model id, engine) is taken from the first report.
        locale: "zh" selects Chinese labels, anything else English.

    Returns:
        An HTML fragment: header, comparison table, overall summary,
        per-GPU detail cards and the trailing star call-to-action.
    """
    set_locale(locale)  # type: ignore[arg-type]
    is_zh = locale == "zh"

    # All reports share the same model_id + engine — pull from the first.
    head = reports[0]
    title = (
        f"<div class='lc-header'>"
        f"<div class='lc-title'>{_esc(head.model_id)}</div>"
        f"<div class='lc-subtitle'>"
        f"{('对比 ' + str(len(reports)) + ' 张 GPU') if is_zh else ('Comparing ' + str(len(reports)) + ' GPUs')}"
        f" · {_esc(head.engine)}"
        f"</div></div>"
    )

    # ---- Value getters ---------------------------------------------------
    def _prod_option(r: EvaluationReport):  # noqa: ANN202
        """Return the fleet option whose tier is "prod", or None."""
        if not r.fleet:
            return None
        return next((o for o in r.fleet.options if o.tier == "prod"), None)

    def _max_concurrent(r: EvaluationReport) -> int | None:
        prod = _prod_option(r)
        return prod.max_concurrent_at_reference_ctx if prod else None

    def _prod_gpu_count(r: EvaluationReport) -> int | None:
        prod = _prod_option(r)
        return prod.gpu_count if prod else None

    def _kv_per_user_128k(r: EvaluationReport) -> int | None:
        av = r.kv_cache_by_context.get(131072)
        return av.value if av is not None else None

    def _native_precision_score(r: EvaluationReport) -> int | None:
        g = r.gpu_spec
        if g is None:
            return None
        return (1 if g.fp8_support else 0) + (1 if g.fp4_support else 0)

    def _fmt_native(v: int | None) -> str:
        if v is None:
            return "—"
        return {0: "FP16 only", 1: "FP8", 2: "FP8 + FP4"}.get(v, str(v))

    def _fmt_nvlink(v) -> str:  # noqa: ANN001
        """Format NVLink bandwidth; a falsy (zero) value means "no NVLink"."""
        if v is None:
            return "—"
        if not v:
            # Localized: the English view used to show the Chinese literal.
            return "无" if is_zh else "None"
        return f"{v} GB/s"

    def _max_context_tokens(r: EvaluationReport) -> int | None:
        """Effective max context the model claims to support.

        In modern HF configs (LLaMA 3+, DeepSeek V3+, Qwen2.5+), the field
        max_position_embeddings already reflects the post-RoPE/YaRN-scaling
        window. rope_scaling_factor is recorded for provenance but must NOT
        be multiplied in again — that double-counts.
        """
        pos = r.profile.position
        if pos is None or pos.max_position_embeddings is None:
            return None
        return int(pos.max_position_embeddings)

    def _fmt_context(v: int | None) -> str:
        """Binary-base formatting so 131072 reads as '128K' not '131K'."""
        if v is None:
            return "—"
        if v >= 1024 * 1024:
            return f"{v / (1024 * 1024):.1f}M".replace(".0M", "M")
        if v >= 1024:
            return f"{v // 1024}K"
        return str(v)

    def _cluster_qps(r: EvaluationReport) -> float | None:
        """Steady-state queries/sec the cluster sustains:
        QPS = cluster_decode_tokens_per_sec / output_tokens_per_request."""
        if not r.decode or r.decode.cluster_tokens_per_sec.value <= 0:
            return None
        out = r.perf_output_tokens or 512
        if out <= 0:
            return None
        return r.decode.cluster_tokens_per_sec.value / out

    def _leading_columns(values: list, better: str) -> set[int]:
        """Return the column indices holding the best value of one row.

        Non-numeric values (e.g. None) are excluded from the contest. The
        result is empty for "info" rows, when no numeric value exists, and
        when all numeric values are equal (nothing to celebrate).
        """
        if better not in ("higher", "lower"):
            return set()
        numeric = [(i, v) for i, v in enumerate(values) if isinstance(v, (int, float))]
        if len({v for _, v in numeric}) <= 1:
            return set()
        pick = max if better == "higher" else min
        best = pick(v for _, v in numeric)
        return {i for i, v in numeric if v == best}

    # Metric definitions: (label_en, label_zh, value_fn, better=lower|higher|info, formatter)
    # "info" rows are not contested — used for model-determined facts (same across
    # GPUs by construction) or for descriptive cells like Bottleneck.
    metrics = [
        # ── Model-determined rows (info; identical across GPUs by definition) ──
        ("Quantization", "量化方案",
         lambda r: r.weight.quantization_guess.value, "info",
         lambda v: _esc(str(v)) if v else "—"),
        ("Weights total", "权重总量",
         lambda r: r.weight.total_bytes.value, "info",
         lambda v: _fmt_bytes(v) if v else "—"),
        ("KV / user @ 128K", "KV / 用户 @ 128K",
         _kv_per_user_128k, "info",
         lambda v: _fmt_bytes(v) if v is not None else "—"),
        ("Max context", "最大上下文",
         _max_context_tokens, "info",
         _fmt_context),
        # ── GPU hardware specs (contested) ──
        ("HBM / card", "单卡显存",
         lambda r: r.gpu_spec.memory_gb if r.gpu_spec else None, "higher",
         lambda v: f"{v} GB" if v is not None else "—"),
        ("HBM bandwidth", "显存带宽",
         lambda r: r.gpu_spec.memory_bandwidth_gbps if r.gpu_spec else None, "higher",
         lambda v: f"{v:,} GB/s" if v is not None else "—"),
        ("NVLink / card", "NVLink 带宽",
         lambda r: r.gpu_spec.nvlink_bandwidth_gbps if r.gpu_spec else None, "higher",
         _fmt_nvlink),
        ("Native FP8/FP4", "原生低精度",
         _native_precision_score, "higher",
         _fmt_native),
        # ── Sizing & performance outcomes (contested) ──
        ("Prod GPUs", "生产档 GPU 数",
         _prod_gpu_count, "lower",
         lambda v: str(v) if v is not None else "—"),
        ("Users @ 128K", "用户 @ 128K",
         _max_concurrent, "higher",
         lambda v: str(v) if v is not None else "—"),
        ("Prefill latency", "Prefill 延迟",
         lambda r: r.prefill.latency_ms.value if r.prefill else None, "lower",
         lambda v: f"{v:.0f} ms" if v is not None else "—"),
        ("Per-GPU decode", "单卡 decode 吞吐",
         lambda r: r.decode.per_gpu_tokens_per_sec.value if r.decode else None, "higher",
         lambda v: f"{v:.0f} tok/s" if v is not None else "—"),
        ("Cluster decode", "集群 decode 吞吐",
         lambda r: r.decode.cluster_tokens_per_sec.value if r.decode else None, "higher",
         lambda v: f"{v:.0f} tok/s" if v is not None else "—"),
        ("Sustained QPS", "稳态 QPS",
         _cluster_qps, "higher",
         lambda v: f"{v:.2f} q/s" if v is not None else "—"),
        # ── Diagnostic (info — string, not a number race) ──
        ("Bottleneck", "瓶颈",
         lambda r: r.concurrency.bottleneck if r.concurrency else None, "info",
         lambda v: f"<code>{_esc(str(v))}</code>" if v else "—"),
    ]

    # GPU column headers
    gpu_headers = "".join(
        f"<th class='lc-cmp-gpu'>{_esc(r.gpu)}</th>" for r in reports
    )

    # One pass builds the table rows AND the per-GPU win tally, so every
    # getter runs once per report and both views agree on who won a row.
    rows_html = []
    win_counts = [0] * len(reports)
    for label_en, label_zh, getter, better, fmt in metrics:
        values = [getter(r) for r in reports]
        leading = _leading_columns(values, better)
        for idx in leading:
            win_counts[idx] += 1

        cells = []
        for i, v in enumerate(values):
            cls = " class='lc-cmp-winner'" if i in leading else ""
            cells.append(f"<td{cls}>{fmt(v)}</td>")

        label = label_zh if is_zh else label_en
        # Tag info rows so the eye knows "this is a model fact, not a contest".
        is_info = better == "info"
        label_cls = "lc-cmp-row-label lc-cmp-row-info" if is_info else "lc-cmp-row-label"
        tr_cls = " class='lc-cmp-tr-info'" if is_info else ""
        rows_html.append(
            f"<tr{tr_cls}><th class='{label_cls}'>{_esc(label)}</th>{''.join(cells)}</tr>"
        )

    # Aggregate winner — the GPU(s) leading the most contested rows.
    overall_text = ""
    if any(win_counts):
        max_wins = max(win_counts)
        top_gpus = [reports[i].gpu for i, c in enumerate(win_counts) if c == max_wins]
        contested_total = sum(1 for m in metrics if m[3] != "info")
        if len(top_gpus) == 1:
            overall_text = (
                f"<div class='lc-cmp-summary'>"
                f"{'综合最优' if is_zh else 'Overall winner'}: "
                f"<strong>{_esc(top_gpus[0])}</strong> "
                f"({max_wins}/{contested_total} "
                f"{'指标领先' if is_zh else 'metrics lead'})"
                f"</div>"
            )
        else:
            overall_text = (
                f"<div class='lc-cmp-summary'>"
                f"{'势均力敌' if is_zh else 'Tied'}: "
                f"<strong>{_esc(' / '.join(top_gpus))}</strong>"
                f"</div>"
            )

    table = (
        f"<div class='lc-section'>"
        f"<h3>{'对比' if is_zh else 'Side-by-side comparison'}</h3>"
        f"<div class='lc-cmp-wrap'>"
        f"<table class='lc-cmp-table'>"
        f"<thead><tr>"
        f"<th class='lc-cmp-row-label'></th>"
        f"{gpu_headers}"
        f"</tr></thead>"
        f"<tbody>{''.join(rows_html)}</tbody>"
        f"</table></div>"
        f"{overall_text}"
        f"</div>"
    )

    # Per-GPU detail headlines (small stat cards) below the table
    detail_blocks = []
    for r in reports:
        weight_str = _fmt_bytes(r.weight.total_bytes.value)
        prod = _prod_gpu_count(r)
        users = _max_concurrent(r)
        detail_blocks.append(
            f"<div class='lc-cmp-detail'>"
            f"<div class='lc-cmp-detail-gpu'>{_esc(r.gpu)}</div>"
            f"<div class='lc-cmp-detail-row'>"
            f"<span>{'权重' if is_zh else 'Weight'}</span><strong>{weight_str}</strong></div>"
            f"<div class='lc-cmp-detail-row'>"
            f"<span>{'生产 GPU' if is_zh else 'Prod GPUs'}</span>"
            f"<strong>{prod if prod is not None else '—'}</strong></div>"
            f"<div class='lc-cmp-detail-row'>"
            f"<span>{'用户 @ 128K' if is_zh else 'Users @ 128K'}</span>"
            f"<strong>{users if users is not None else '—'}</strong></div>"
            f"</div>"
        )
    detail_section = (
        f"<div class='lc-section'>"
        f"<h3>{'各档详情' if is_zh else 'Per-GPU detail'}</h3>"
        f"<div class='lc-cmp-details'>{''.join(detail_blocks)}</div>"
        f"</div>"
    )

    return (
        "<div class='lc-result'>"
        + title
        + table
        + detail_section
        + _render_star_cta(is_zh)
        + "</div>"
    )
|
| 837 |
+
|
| 838 |
+
|
| 839 |
+
def _render_star_cta(is_zh: bool) -> str:
|
| 840 |
+
"""Tail-of-result CTA — shown right after the user got their answer,
|
| 841 |
+
which is when satisfaction is highest and the GitHub star ask reads as
|
| 842 |
+
'thanks for the tool' rather than 'please give me attention'."""
|
| 843 |
+
en_msg = "Saved you GPU-sizing math?"
|
| 844 |
+
zh_msg = "省了你 GPU 选型的时间?"
|
| 845 |
+
cta_en = "Star on GitHub"
|
| 846 |
+
cta_zh = "给个 Star"
|
| 847 |
+
text_top = zh_msg if is_zh else en_msg
|
| 848 |
+
text_bottom = en_msg if is_zh else zh_msg
|
| 849 |
+
cta = f"{cta_zh if is_zh else cta_en} · {cta_en if is_zh else cta_zh}"
|
| 850 |
+
return (
|
| 851 |
+
"<a class='lc-star-cta' href='https://github.com/FlyTOmeLight/llm-cal' "
|
| 852 |
+
"target='_blank' rel='noopener'>"
|
| 853 |
+
"<svg viewBox='0 0 16 16' width='18' height='18' aria-hidden='true' fill='currentColor'>"
|
| 854 |
+
"<path d='M8 0C3.58 0 0 3.58 0 8a8 8 0 0 0 5.47 7.59c.4.07.55-.17.55-.38v-1.33c-2.22.48-2.69-1.07-2.69-1.07-.36-.92-.89-1.17-.89-1.17-.73-.5.06-.49.06-.49.81.06 1.23.83 1.23.83.72 1.23 1.88.87 2.34.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.83-2.15-.08-.2-.36-1.02.08-2.13 0 0 .67-.21 2.2.82a7.6 7.6 0 0 1 4 0c1.53-1.04 2.2-.82 2.2-.82.44 1.11.16 1.93.08 2.13.51.56.83 1.27.83 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48v2.19c0 .21.15.46.55.38A8 8 0 0 0 16 8c0-4.42-3.58-8-8-8z'/>"
|
| 855 |
+
"</svg>"
|
| 856 |
+
f"<div class='lc-star-cta-text'>"
|
| 857 |
+
f"<div class='lc-star-cta-q'>{text_top}</div>"
|
| 858 |
+
f"<div class='lc-star-cta-q-en'>{text_bottom}</div>"
|
| 859 |
+
f"</div>"
|
| 860 |
+
f"<div class='lc-star-cta-action'>{cta} →</div>"
|
| 861 |
+
"</a>"
|
| 862 |
+
)
|
| 863 |
+
|
| 864 |
+
|
| 865 |
+
def _render_explain(entries: list[ExplainEntry], is_zh: bool) -> str:
    """Render the --explain derivation trace as one HTML section.

    Every entry becomes a block with its heading, formula, optional input
    list, optional numbered steps, the result and an optional source line.
    Returns an empty string when there are no entries.
    """
    if not entries:
        return ""

    source_word = "来源" if is_zh else "Source"
    result_word = "结果" if is_zh else "Result"

    rendered = []
    for entry in entries:
        # Inputs: one <li> per named input, with an optional trailing note.
        inputs_html = ""
        if entry.inputs:
            items = []
            for item in entry.inputs:
                line = (
                    f"<li><code>{_esc(item.name)}</code> = "
                    f"<strong>{_esc(item.value)}</strong> "
                    f"<span class='lc-explain-label'>{_esc(item.label)}</span>"
                )
                if item.note:
                    line += f" — <em>{_esc(item.note)}</em>"
                items.append(line + "</li>")
            inputs_html = "<ul class='lc-explain-inputs'>" + "".join(items) + "</ul>"

        # Steps: an ordered list of the derivation lines.
        steps_html = ""
        if entry.steps:
            step_items = [f"<li>{_esc(step)}</li>" for step in entry.steps]
            steps_html = "<ol class='lc-explain-steps'>" + "".join(step_items) + "</ol>"

        source_html = ""
        if entry.source:
            source_html = f"<div class='lc-prov'>{source_word}: {_esc(entry.source)}</div>"

        rendered.append(
            "".join(
                [
                    "<div class='lc-explain-entry'>",
                    f"<div class='lc-explain-heading'>{_esc(entry.heading)}</div>",
                    f"<div class='lc-explain-formula'><code>{_esc(entry.formula)}</code></div>",
                    inputs_html,
                    steps_html,
                    "<div class='lc-explain-result'>",
                    f"{result_word}: <strong>{_esc(entry.result)}</strong></div>",
                    source_html,
                    "</div>",
                ]
            )
        )

    section_title = "推导链 (--explain)" if is_zh else "Derivation trace (--explain)"
    return (
        "<div class='lc-result'>"
        "<div class='lc-section'>"
        f"<h3>{section_title}</h3>"
        + "".join(rendered)
        + "</div></div>"
    )
|
| 908 |
+
|
| 909 |
+
|
| 910 |
+
def _render_llm_review(content: str | None, error: str | None, model: str, is_zh: bool) -> str:
|
| 911 |
+
if error:
|
| 912 |
+
return _render_error(f"LLM review: {error}", is_zh)
|
| 913 |
+
if not content:
|
| 914 |
+
return ""
|
| 915 |
+
# The LLM responds with markdown — convert to a simple HTML block for display.
|
| 916 |
+
# gr.HTML doesn't run markdown, but the LLM's headers (## ...) still read OK as text.
|
| 917 |
+
safe = _esc(content).replace("\n", "<br>")
|
| 918 |
+
return (
|
| 919 |
+
"<div class='lc-result'>"
|
| 920 |
+
f"<div class='lc-section'>"
|
| 921 |
+
f"<h3>{'LLM 审计 (--llm-review)' if is_zh else 'LLM review (--llm-review)'} "
|
| 922 |
+
f"<span class='lc-llm-model'>{_esc(model)}</span></h3>"
|
| 923 |
+
f"<div class='lc-llm-banner'>"
|
| 924 |
+
f"{_label_chip('llm-opinion')} "
|
| 925 |
+
f"{'仅供参考,不覆盖前 6 个 label' if is_zh else 'Second opinion — never overrides the 6 primary labels'}"
|
| 926 |
+
f"</div>"
|
| 927 |
+
f"<div class='lc-llm-content'>{safe}</div>"
|
| 928 |
+
f"</div></div>"
|
| 929 |
+
)
|
| 930 |
+
|
| 931 |
+
|
| 932 |
+
def _render_error(msg: str, is_zh: bool) -> str:
    """Wrap an error message in the standard error box.

    The message is escaped and placed in a <pre> so its line breaks are
    kept; only the heading is localized.
    """
    if is_zh:
        heading = "出错了"
    else:
        heading = "Error"
    return "".join(
        [
            "<div class='lc-result lc-error'>",
            f"<h3>{heading}</h3>",
            f"<pre>{_esc(msg)}</pre></div>",
        ]
    )
|
| 939 |
+
|
| 940 |
+
|
| 941 |
+
def _render_loading(is_zh: bool) -> str:
|
| 942 |
+
msg = (
|
| 943 |
+
"正在拉取模型元数据 + 读 safetensors header… 首次大模型约 3-8 秒"
|
| 944 |
+
if is_zh
|
| 945 |
+
else "Fetching model metadata + reading safetensors header… "
|
| 946 |
+
"first lookup of a large model takes 3-8 seconds"
|
| 947 |
+
)
|
| 948 |
+
return (
|
| 949 |
+
"<div class='lc-result lc-loading'>"
|
| 950 |
+
"<div class='lc-spinner'></div>"
|
| 951 |
+
f"<div class='lc-loading-text'>{msg}</div>"
|
| 952 |
+
"</div>"
|
| 953 |
+
)
|
| 954 |
+
|
| 955 |
+
|
| 956 |
+
# ---------------------------------------------------------------------------
|
| 957 |
+
# Backend handler
|
| 958 |
+
|
| 959 |
+
# Module-level cache of Evaluator instances keyed by source key; populated
# lazily by _get_evaluator() the first time each source is requested.
_evaluators: dict[str, Evaluator] = {}
|
| 960 |
+
|
| 961 |
+
|
| 962 |
+
def _get_evaluator(source_key: str) -> Evaluator:
    """Return the shared Evaluator for ``source_key``, creating it on first use.

    "modelscope" gets a ModelScope-backed evaluator; every other key gets a
    HuggingFace-backed one. Instances are kept in the module-level cache
    because Evaluator holds an HfApi client internally and rebuilding it on
    every keystroke would be wasteful.
    """
    try:
        return _evaluators[source_key]
    except KeyError:
        pass

    backend = ModelScopeSource() if source_key == "modelscope" else HuggingFaceSource()
    evaluator = Evaluator(source=backend)
    _evaluators[source_key] = evaluator
    return evaluator
|
| 971 |
+
|
| 972 |
+
|
| 973 |
+
def calculate(
|
| 974 |
+
model_id: str,
|
| 975 |
+
gpu, # list[str] from multiselect; str also tolerated # noqa: ANN001
|
| 976 |
+
engine: str,
|
| 977 |
+
context_length: int | None,
|
| 978 |
+
lang: str,
|
| 979 |
+
source: str,
|
| 980 |
+
gpu_count: int | None,
|
| 981 |
+
input_tokens: int,
|
| 982 |
+
output_tokens: int,
|
| 983 |
+
target_tps: float,
|
| 984 |
+
prefill_util: float,
|
| 985 |
+
decode_bw_util: float,
|
| 986 |
+
concurrency_degradation: float,
|
| 987 |
+
refresh: bool,
|
| 988 |
+
explain: bool,
|
| 989 |
+
llm_review: bool,
|
| 990 |
+
hf_token: str,
|
| 991 |
+
ms_token: str,
|
| 992 |
+
llm_api_key: str,
|
| 993 |
+
llm_base_url: str,
|
| 994 |
+
llm_model: str,
|
| 995 |
+
) -> tuple[str, str, str]:
|
| 996 |
+
"""Returns (main_html, explain_html, llm_review_html)."""
|
| 997 |
+
locale = "zh" if lang.startswith("中") else "en"
|
| 998 |
+
is_zh = locale == "zh"
|
| 999 |
+
|
| 1000 |
+
# Normalize GPU input. Multiselect returns list; defensive coerce for safety.
|
| 1001 |
+
if isinstance(gpu, str):
|
| 1002 |
+
gpu_list = [gpu] if gpu else []
|
| 1003 |
+
elif isinstance(gpu, (list, tuple)):
|
| 1004 |
+
gpu_list = [g for g in gpu if g]
|
| 1005 |
+
else:
|
| 1006 |
+
gpu_list = []
|
| 1007 |
+
|
| 1008 |
+
if not model_id or not model_id.strip():
|
| 1009 |
+
return (
|
| 1010 |
+
_render_error(
|
| 1011 |
+
"请输入模型 ID" if is_zh else "Enter a model id",
|
| 1012 |
+
is_zh,
|
| 1013 |
+
),
|
| 1014 |
+
"",
|
| 1015 |
+
"",
|
| 1016 |
+
)
|
| 1017 |
+
if not gpu_list:
|
| 1018 |
+
return (_render_error("请选择 GPU" if is_zh else "Pick a GPU", is_zh), "", "")
|
| 1019 |
+
|
| 1020 |
+
is_compare = len(gpu_list) >= 2
|
| 1021 |
+
|
| 1022 |
+
# Resolve source key. The radio shows e.g. "HuggingFace" / "ModelScope".
|
| 1023 |
+
src_key = "modelscope" if "modelscope" in source.lower() else "huggingface"
|
| 1024 |
+
|
| 1025 |
+
# Inject user-provided tokens into env for the duration of this call only.
|
| 1026 |
+
# We restore the prior values in the finally block so a token entered for
|
| 1027 |
+
# one model doesn't leak into the next request from a different user.
|
| 1028 |
+
token_env_keys = (
|
| 1029 |
+
"HF_TOKEN",
|
| 1030 |
+
"HUGGING_FACE_HUB_TOKEN",
|
| 1031 |
+
"MODELSCOPE_API_TOKEN",
|
| 1032 |
+
"MODELSCOPE_TOKEN",
|
| 1033 |
+
)
|
| 1034 |
+
old_token_env = {k: os.environ.get(k) for k in token_env_keys}
|
| 1035 |
+
if hf_token and hf_token.strip():
|
| 1036 |
+
os.environ["HF_TOKEN"] = hf_token.strip()
|
| 1037 |
+
if ms_token and ms_token.strip():
|
| 1038 |
+
os.environ["MODELSCOPE_API_TOKEN"] = ms_token.strip()
|
| 1039 |
+
|
| 1040 |
+
def _eval_one(g: str) -> EvaluationReport:
|
| 1041 |
+
return _get_evaluator(src_key).evaluate(
|
| 1042 |
+
model_id=model_id.strip(),
|
| 1043 |
+
gpu=g,
|
| 1044 |
+
engine=engine,
|
| 1045 |
+
gpu_count=gpu_count if gpu_count and gpu_count > 0 else None,
|
| 1046 |
+
context_length=context_length if context_length and context_length > 0 else None,
|
| 1047 |
+
refresh=refresh,
|
| 1048 |
+
input_tokens=int(input_tokens) if input_tokens else 2000,
|
| 1049 |
+
output_tokens=int(output_tokens) if output_tokens else 512,
|
| 1050 |
+
target_tokens_per_sec=float(target_tps) if target_tps else 30.0,
|
| 1051 |
+
prefill_utilization=float(prefill_util) if prefill_util else 0.40,
|
| 1052 |
+
decode_bw_utilization=float(decode_bw_util) if decode_bw_util else 0.50,
|
| 1053 |
+
concurrency_degradation=(
|
| 1054 |
+
float(concurrency_degradation) if concurrency_degradation else 1.0
|
| 1055 |
+
),
|
| 1056 |
+
)
|
| 1057 |
+
|
| 1058 |
+
try:
|
| 1059 |
+
# ---- Compare path: 2-4 GPUs --------------------------------------
|
| 1060 |
+
if is_compare:
|
| 1061 |
+
try:
|
| 1062 |
+
reports = [_eval_one(g) for g in gpu_list]
|
| 1063 |
+
except Exception as e: # noqa: BLE001
|
| 1064 |
+
return (_render_error(f"{type(e).__name__}: {e}", is_zh), "", "")
|
| 1065 |
+
return _render_compare(reports, locale), "", ""
|
| 1066 |
+
|
| 1067 |
+
# ---- Single-GPU path (existing flow) ------------------------------
|
| 1068 |
+
try:
|
| 1069 |
+
report = _eval_one(gpu_list[0])
|
| 1070 |
+
except Exception as e: # noqa: BLE001
|
| 1071 |
+
return (_render_error(f"{type(e).__name__}: {e}", is_zh), "", "")
|
| 1072 |
+
|
| 1073 |
+
main_html = _render(report, locale)
|
| 1074 |
+
explain_html = ""
|
| 1075 |
+
llm_html = ""
|
| 1076 |
+
|
| 1077 |
+
if explain or llm_review:
|
| 1078 |
+
entries = build_explain(report)
|
| 1079 |
+
if explain:
|
| 1080 |
+
explain_html = _render_explain(entries, is_zh)
|
| 1081 |
+
if llm_review:
|
| 1082 |
+
# Only set env vars if user actually provided them — never persist
|
| 1083 |
+
# them in env beyond this call's scope (they live in process env
|
| 1084 |
+
# for the duration of the call, but we don't persist to disk).
|
| 1085 |
+
old_env = {
|
| 1086 |
+
"LLM_CAL_REVIEWER_API_KEY": os.environ.get("LLM_CAL_REVIEWER_API_KEY"),
|
| 1087 |
+
"LLM_CAL_REVIEWER_BASE_URL": os.environ.get("LLM_CAL_REVIEWER_BASE_URL"),
|
| 1088 |
+
"LLM_CAL_REVIEWER_MODEL": os.environ.get("LLM_CAL_REVIEWER_MODEL"),
|
| 1089 |
+
}
|
| 1090 |
+
try:
|
| 1091 |
+
if llm_api_key.strip():
|
| 1092 |
+
os.environ["LLM_CAL_REVIEWER_API_KEY"] = llm_api_key.strip()
|
| 1093 |
+
if llm_base_url.strip():
|
| 1094 |
+
os.environ["LLM_CAL_REVIEWER_BASE_URL"] = llm_base_url.strip()
|
| 1095 |
+
if llm_model.strip():
|
| 1096 |
+
os.environ["LLM_CAL_REVIEWER_MODEL"] = llm_model.strip()
|
| 1097 |
+
result = run_review(entries, locale=locale) # type: ignore[arg-type]
|
| 1098 |
+
finally:
|
| 1099 |
+
for k, v in old_env.items():
|
| 1100 |
+
if v is None:
|
| 1101 |
+
os.environ.pop(k, None)
|
| 1102 |
+
else:
|
| 1103 |
+
os.environ[k] = v
|
| 1104 |
+
llm_html = _render_llm_review(result.content, result.error, result.model, is_zh)
|
| 1105 |
+
|
| 1106 |
+
return main_html, explain_html, llm_html
|
| 1107 |
+
finally:
|
| 1108 |
+
for k, v in old_token_env.items():
|
| 1109 |
+
if v is None:
|
| 1110 |
+
os.environ.pop(k, None)
|
| 1111 |
+
else:
|
| 1112 |
+
os.environ[k] = v
|
| 1113 |
+
|
| 1114 |
+
|
| 1115 |
+
def show_loading(lang: str) -> tuple[str, str, str]:
    """Build the three placeholder outputs displayed while a request is running.

    Args:
        lang: Label of the selected UI language; a value starting with "中"
            selects the Chinese variant, anything else the non-Chinese one.

    Returns:
        A 3-tuple whose first item is the loading markup produced by
        ``_render_loading`` and whose remaining two items are empty strings.
    """
    placeholder = _render_loading(lang.startswith("中"))
    return (placeholder, "", "")
|
| 1118 |
+
|
| 1119 |
+
|
| 1120 |
+
# ---------------------------------------------------------------------------
|
| 1121 |
+
# UI
|
| 1122 |
+
|
| 1123 |
+
# Module-level theme object: the Soft theme from `gr.themes`, configured with
# an indigo primary hue.
THEME = gr.themes.Soft(primary_hue="indigo")
|
| 1124 |
+
|
| 1125 |
+
# Static HTML for the page hero. It contains, in order:
#   * the title block ("llm-cal" plus a bilingual English/Chinese tagline),
#   * a GitHub link pill with an inline SVG mark and a shields.io star badge,
#   * a "pitch" row contrasting a gpu_poor figure with the llm-cal figure,
#     followed by a summary card naming the model/GPU and the error comparison.
# Every element uses `lc-hero*` / `lc-pitch*` class names; the markup carries
# no inline styling of its own.
HERO_HTML = """
|
| 1126 |
+
<div class='lc-hero'>
|
| 1127 |
+
<div class='lc-hero-top'>
|
| 1128 |
+
<div class='lc-hero-titleblock'>
|
| 1129 |
+
<div class='lc-hero-title'>llm-cal</div>
|
| 1130 |
+
<div class='lc-hero-tagline'>
|
| 1131 |
+
LLM inference hardware calculator · 大模型推理硬件计算器<br>
|
| 1132 |
+
Architecture-aware · Engine-aware · <strong>Honest-labeled</strong>
|
| 1133 |
+
</div>
|
| 1134 |
+
</div>
|
| 1135 |
+
<a class='lc-hero-gh' href='https://github.com/FlyTOmeLight/llm-cal' target='_blank' rel='noopener'>
|
| 1136 |
+
<svg viewBox='0 0 16 16' width='16' height='16' aria-hidden='true' fill='currentColor'>
|
| 1137 |
+
<path d='M8 0C3.58 0 0 3.58 0 8a8 8 0 0 0 5.47 7.59c.4.07.55-.17.55-.38v-1.33c-2.22.48-2.69-1.07-2.69-1.07-.36-.92-.89-1.17-.89-1.17-.73-.5.06-.49.06-.49.81.06 1.23.83 1.23.83.72 1.23 1.88.87 2.34.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.83-2.15-.08-.2-.36-1.02.08-2.13 0 0 .67-.21 2.2.82a7.6 7.6 0 0 1 4 0c1.53-1.04 2.2-.82 2.2-.82.44 1.11.16 1.93.08 2.13.51.56.83 1.27.83 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48v2.19c0 .21.15.46.55.38A8 8 0 0 0 16 8c0-4.42-3.58-8-8-8z'/>
|
| 1138 |
+
</svg>
|
| 1139 |
+
<span class='lc-hero-gh-text'>GitHub</span>
|
| 1140 |
+
<img class='lc-hero-gh-stars' alt='stars'
|
| 1141 |
+
src='https://img.shields.io/github/stars/FlyTOmeLight/llm-cal?style=flat-square&logo=&label=&color=eef2ff&labelColor=eef2ff'
|
| 1142 |
+
loading='lazy' />
|
| 1143 |
+
</a>
|
| 1144 |
+
</div>
|
| 1145 |
+
<div class='lc-hero-pitch'>
|
| 1146 |
+
<div class='lc-pitch-card lc-pitch-bad'>
|
| 1147 |
+
<div class='lc-pitch-tool'>gpu_poor</div>
|
| 1148 |
+
<div class='lc-pitch-num-bad'>284 GB</div>
|
| 1149 |
+
<div class='lc-pitch-method'>assumes pure FP8 · 假设纯 FP8</div>
|
| 1150 |
+
</div>
|
| 1151 |
+
<div class='lc-pitch-arrow'>→</div>
|
| 1152 |
+
<div class='lc-pitch-card lc-pitch-good'>
|
| 1153 |
+
<div class='lc-pitch-tool'>llm-cal</div>
|
| 1154 |
+
<div class='lc-pitch-num-good'>160 GB</div>
|
| 1155 |
+
<div class='lc-pitch-method'>reads real safetensors bytes · 读真实字节</div>
|
| 1156 |
+
</div>
|
| 1157 |
+
<div class='lc-pitch-summary'>
|
| 1158 |
+
<div class='lc-pitch-model'>DeepSeek-V4-Flash · H800</div>
|
| 1159 |
+
<div class='lc-pitch-result'>0.2% error vs 45% · 误差 0.2% vs 45%</div>
|
| 1160 |
+
</div>
|
| 1161 |
+
</div>
|
| 1162 |
+
</div>
|
| 1163 |
+
"""
|
| 1164 |
+
|
| 1165 |
+
|
| 1166 |
+
CUSTOM_CSS = """
|
| 1167 |
+
/* Font stack — system fonts in both English + Chinese, no Gradio default serif */
|
| 1168 |
+
* {
|
| 1169 |
+
font-family: -apple-system, BlinkMacSystemFont, "Inter", "Helvetica Neue",
|
| 1170 |
+
"PingFang SC", "Microsoft YaHei", "Segoe UI", Roboto, Arial, sans-serif !important;
|
| 1171 |
+
}
|
| 1172 |
+
|
| 1173 |
+
/* Hide Gradio's default footer chrome that looks like part of our app */
|
| 1174 |
+
footer { display: none !important; }
|
| 1175 |
+
.show-api, .built-with, .settings { display: none !important; }
|
| 1176 |
+
|
| 1177 |
+
/* Tighter overall padding + center on wide screens — without margin:auto the
|
| 1178 |
+
container left-aligns and leaves ~800px empty on 1920+ displays.
|
| 1179 |
+
width:100% makes it shrink to viewport when narrower than max-width
|
| 1180 |
+
(otherwise on mobile align-items:stretch + max-width overflows). */
|
| 1181 |
+
.gradio-container {
|
| 1182 |
+
max-width: 1100px !important;
|
| 1183 |
+
width: 100% !important;
|
| 1184 |
+
margin-left: auto !important;
|
| 1185 |
+
margin-right: auto !important;
|
| 1186 |
+
}
|
| 1187 |
+
|
| 1188 |
+
/* Hero section */
|
| 1189 |
+
.lc-hero {
|
| 1190 |
+
margin: 8px 0 24px 0;
|
| 1191 |
+
padding: 24px 0 18px 0;
|
| 1192 |
+
border-bottom: 1px solid #e5e7eb;
|
| 1193 |
+
}
|
| 1194 |
+
.dark .lc-hero { border-bottom-color: #374151; }
|
| 1195 |
+
|
| 1196 |
+
/* Top row: title block (left) + GitHub link (right). On mobile the GH link
|
| 1197 |
+
wraps to its own line above or below the title — order kept so it stays
|
| 1198 |
+
visible above the fold. */
|
| 1199 |
+
.lc-hero-top {
|
| 1200 |
+
display: flex;
|
| 1201 |
+
align-items: flex-start;
|
| 1202 |
+
justify-content: space-between;
|
| 1203 |
+
gap: 16px;
|
| 1204 |
+
flex-wrap: wrap;
|
| 1205 |
+
margin-bottom: 14px;
|
| 1206 |
+
}
|
| 1207 |
+
.lc-hero-titleblock {
|
| 1208 |
+
flex: 1 1 320px;
|
| 1209 |
+
min-width: 0;
|
| 1210 |
+
}
|
| 1211 |
+
.lc-hero-gh {
|
| 1212 |
+
display: inline-flex;
|
| 1213 |
+
align-items: center;
|
| 1214 |
+
gap: 8px;
|
| 1215 |
+
padding: 6px 12px;
|
| 1216 |
+
border: 1px solid #c7d2fe;
|
| 1217 |
+
background: #eef2ff;
|
| 1218 |
+
border-radius: 999px;
|
| 1219 |
+
font-size: 13px !important;
|
| 1220 |
+
font-weight: 600 !important;
|
| 1221 |
+
color: #4338ca !important;
|
| 1222 |
+
text-decoration: none !important;
|
| 1223 |
+
white-space: nowrap;
|
| 1224 |
+
transition: background 0.15s ease, border-color 0.15s ease;
|
| 1225 |
+
flex: 0 0 auto;
|
| 1226 |
+
}
|
| 1227 |
+
.lc-hero-gh:hover {
|
| 1228 |
+
background: #e0e7ff;
|
| 1229 |
+
border-color: #a5b4fc;
|
| 1230 |
+
}
|
| 1231 |
+
.dark .lc-hero-gh {
|
| 1232 |
+
background: #1e1b4b;
|
| 1233 |
+
border-color: #3730a3;
|
| 1234 |
+
color: #c7d2fe !important;
|
| 1235 |
+
}
|
| 1236 |
+
.dark .lc-hero-gh:hover { background: #312e81; border-color: #4338ca; }
|
| 1237 |
+
.lc-hero-gh svg { display: block; }
|
| 1238 |
+
.lc-hero-gh-stars {
|
| 1239 |
+
height: 18px;
|
| 1240 |
+
vertical-align: middle;
|
| 1241 |
+
border-radius: 4px;
|
| 1242 |
+
}
|
| 1243 |
+
|
| 1244 |
+
.lc-hero-title {
|
| 1245 |
+
font-size: 32px !important;
|
| 1246 |
+
font-weight: 800 !important;
|
| 1247 |
+
letter-spacing: -0.02em;
|
| 1248 |
+
color: #0f172a !important;
|
| 1249 |
+
margin: 0 !important;
|
| 1250 |
+
line-height: 1.15;
|
| 1251 |
+
}
|
| 1252 |
+
.dark .lc-hero-title { color: #f8fafc !important; }
|
| 1253 |
+
.lc-hero-tagline {
|
| 1254 |
+
font-size: 16px !important;
|
| 1255 |
+
color: #6b7280 !important;
|
| 1256 |
+
margin: 6px 0 16px 0;
|
| 1257 |
+
line-height: 1.5;
|
| 1258 |
+
}
|
| 1259 |
+
.lc-hero-pitch {
|
| 1260 |
+
display: grid;
|
| 1261 |
+
/* 4 cells: bad-card / arrow / good-card / summary on wide screens */
|
| 1262 |
+
grid-template-columns: 1fr 30px 1fr 1.2fr;
|
| 1263 |
+
gap: 14px;
|
| 1264 |
+
align-items: stretch;
|
| 1265 |
+
padding: 0;
|
| 1266 |
+
font-size: 13px !important;
|
| 1267 |
+
color: #1e293b !important;
|
| 1268 |
+
}
|
| 1269 |
+
.dark .lc-hero-pitch { color: #f1f5f9 !important; }
|
| 1270 |
+
|
| 1271 |
+
/* Tablet: bad / arrow / good in row 1, summary full-width row 2 */
|
| 1272 |
+
@media (max-width: 900px) {
|
| 1273 |
+
.lc-hero-pitch {
|
| 1274 |
+
grid-template-columns: 1fr 28px 1fr;
|
| 1275 |
+
grid-template-rows: auto auto;
|
| 1276 |
+
}
|
| 1277 |
+
.lc-pitch-summary { grid-column: 1 / -1; }
|
| 1278 |
+
}
|
| 1279 |
+
|
| 1280 |
+
/* Mobile: stack everything, hide the arrow */
|
| 1281 |
+
@media (max-width: 540px) {
|
| 1282 |
+
.lc-hero-pitch {
|
| 1283 |
+
grid-template-columns: 1fr;
|
| 1284 |
+
grid-template-rows: repeat(3, auto);
|
| 1285 |
+
}
|
| 1286 |
+
.lc-pitch-arrow { display: none; }
|
| 1287 |
+
.lc-pitch-summary { grid-column: auto; }
|
| 1288 |
+
}
|
| 1289 |
+
|
| 1290 |
+
.lc-pitch-card {
|
| 1291 |
+
padding: 14px 18px;
|
| 1292 |
+
border-radius: 10px;
|
| 1293 |
+
border: 1px solid #e5e7eb;
|
| 1294 |
+
background: #ffffff;
|
| 1295 |
+
display: flex;
|
| 1296 |
+
flex-direction: column;
|
| 1297 |
+
justify-content: center;
|
| 1298 |
+
min-width: 0;
|
| 1299 |
+
}
|
| 1300 |
+
.dark .lc-pitch-card { background: #111827; border-color: #374151; }
|
| 1301 |
+
/* Subtle accent bar on the left, not a screaming red/green border */
|
| 1302 |
+
.lc-pitch-bad { border-left: 3px solid #cbd5e1; }
|
| 1303 |
+
.lc-pitch-good { border-left: 3px solid #4f46e5; }
|
| 1304 |
+
.dark .lc-pitch-bad { border-left-color: #475569; }
|
| 1305 |
+
.dark .lc-pitch-good { border-left-color: #818cf8; }
|
| 1306 |
+
|
| 1307 |
+
.lc-pitch-tool {
|
| 1308 |
+
font-size: 12px !important;
|
| 1309 |
+
font-weight: 600 !important;
|
| 1310 |
+
color: #6b7280 !important;
|
| 1311 |
+
font-family: "SF Mono", "JetBrains Mono", Menlo, monospace !important;
|
| 1312 |
+
margin-bottom: 4px;
|
| 1313 |
+
}
|
| 1314 |
+
.lc-pitch-num-bad { font-size: 24px !important; font-weight: 800 !important; color: #b91c1c !important; line-height: 1.1; letter-spacing: -0.01em; }
|
| 1315 |
+
.lc-pitch-num-good { font-size: 24px !important; font-weight: 800 !important; color: #15803d !important; line-height: 1.1; letter-spacing: -0.01em; }
|
| 1316 |
+
.dark .lc-pitch-num-bad { color: #f87171 !important; }
|
| 1317 |
+
.dark .lc-pitch-num-good { color: #4ade80 !important; }
|
| 1318 |
+
.lc-pitch-method {
|
| 1319 |
+
font-size: 11px !important;
|
| 1320 |
+
color: #6b7280 !important;
|
| 1321 |
+
margin-top: 6px;
|
| 1322 |
+
line-height: 1.4;
|
| 1323 |
+
}
|
| 1324 |
+
|
| 1325 |
+
.lc-pitch-arrow {
|
| 1326 |
+
display: flex;
|
| 1327 |
+
align-items: center;
|
| 1328 |
+
font-size: 22px !important;
|
| 1329 |
+
color: #9ca3af !important;
|
| 1330 |
+
font-weight: 300;
|
| 1331 |
+
}
|
| 1332 |
+
|
| 1333 |
+
.lc-pitch-summary {
|
| 1334 |
+
flex: 1 1 200px;
|
| 1335 |
+
padding: 14px 18px;
|
| 1336 |
+
border-radius: 10px;
|
| 1337 |
+
background: #eef2ff;
|
| 1338 |
+
border: 1px solid #c7d2fe;
|
| 1339 |
+
display: flex;
|
| 1340 |
+
flex-direction: column;
|
| 1341 |
+
justify-content: center;
|
| 1342 |
+
}
|
| 1343 |
+
.dark .lc-pitch-summary { background: #1e1b4b; border-color: #3730a3; }
|
| 1344 |
+
.lc-pitch-model {
|
| 1345 |
+
font-size: 11px !important;
|
| 1346 |
+
font-weight: 600 !important;
|
| 1347 |
+
text-transform: uppercase;
|
| 1348 |
+
letter-spacing: 0.06em;
|
| 1349 |
+
color: #6366f1 !important;
|
| 1350 |
+
margin-bottom: 4px;
|
| 1351 |
+
}
|
| 1352 |
+
.dark .lc-pitch-model { color: #a5b4fc !important; }
|
| 1353 |
+
.lc-pitch-result {
|
| 1354 |
+
font-size: 14px !important;
|
| 1355 |
+
font-weight: 700 !important;
|
| 1356 |
+
color: #312e81 !important;
|
| 1357 |
+
}
|
| 1358 |
+
.dark .lc-pitch-result { color: #e0e7ff !important; }
|
| 1359 |
+
|
| 1360 |
+
/* Primary button — match the indigo theme; constrain width so it's not a billboard */
|
| 1361 |
+
button.primary,
|
| 1362 |
+
button[variant="primary"],
|
| 1363 |
+
.primary > button {
|
| 1364 |
+
background: #4f46e5 !important;
|
| 1365 |
+
border-color: #4f46e5 !important;
|
| 1366 |
+
color: #ffffff !important;
|
| 1367 |
+
font-weight: 600 !important;
|
| 1368 |
+
letter-spacing: 0.01em;
|
| 1369 |
+
border-radius: 8px !important;
|
| 1370 |
+
padding: 10px 28px !important;
|
| 1371 |
+
}
|
| 1372 |
+
button.primary:hover,
|
| 1373 |
+
button[variant="primary"]:hover,
|
| 1374 |
+
.primary > button:hover { background: #4338ca !important; border-color: #4338ca !important; }
|
| 1375 |
+
|
| 1376 |
+
/* The wrapper around the Calculate button — center it, give it sane width */
|
| 1377 |
+
.lc-submit-wrap {
|
| 1378 |
+
display: flex !important;
|
| 1379 |
+
justify-content: center !important;
|
| 1380 |
+
margin: 20px 0 8px 0 !important;
|
| 1381 |
+
}
|
| 1382 |
+
.lc-submit-wrap button {
|
| 1383 |
+
min-width: 220px !important;
|
| 1384 |
+
max-width: 320px !important;
|
| 1385 |
+
width: auto !important;
|
| 1386 |
+
}
|
| 1387 |
+
|
| 1388 |
+
/* Form labels — kill Gradio's purple chip; make labels plain uppercase small text */
|
| 1389 |
+
[data-testid="block-info"] {
|
| 1390 |
+
background: transparent !important;
|
| 1391 |
+
border: none !important;
|
| 1392 |
+
padding: 0 !important;
|
| 1393 |
+
margin: 0 0 6px 0 !important;
|
| 1394 |
+
font-size: 11px !important;
|
| 1395 |
+
font-weight: 600 !important;
|
| 1396 |
+
text-transform: uppercase !important;
|
| 1397 |
+
letter-spacing: 0.05em !important;
|
| 1398 |
+
color: #6b7280 !important;
|
| 1399 |
+
border-radius: 0 !important;
|
| 1400 |
+
display: block !important;
|
| 1401 |
+
}
|
| 1402 |
+
.dark [data-testid="block-info"] { color: #9ca3af !important; }
|
| 1403 |
+
|
| 1404 |
+
/* Tooltip / info-text — single line, secondary color, no italic */
|
| 1405 |
+
.info-text {
|
| 1406 |
+
font-size: 11px !important;
|
| 1407 |
+
color: #94a3b8 !important;
|
| 1408 |
+
margin: 0 0 4px 0 !important;
|
| 1409 |
+
line-height: 1.4 !important;
|
| 1410 |
+
padding: 0 !important;
|
| 1411 |
+
font-style: normal !important;
|
| 1412 |
+
white-space: normal !important;
|
| 1413 |
+
}
|
| 1414 |
+
.info-text br { display: none !important; }
|
| 1415 |
+
.dark .info-text { color: #64748b !important; }
|
| 1416 |
+
|
| 1417 |
+
/* Kill Gradio's grey form-panel chrome entirely — labels + inputs float on the page */
|
| 1418 |
+
.block,
|
| 1419 |
+
.block.padded,
|
| 1420 |
+
.block.gradio-container,
|
| 1421 |
+
.form,
|
| 1422 |
+
.row,
|
| 1423 |
+
[data-testid="block"] {
|
| 1424 |
+
background: transparent !important;
|
| 1425 |
+
border: none !important;
|
| 1426 |
+
box-shadow: none !important;
|
| 1427 |
+
}
|
| 1428 |
+
.block.padded { padding: 6px 0 !important; }
|
| 1429 |
+
.form { padding: 0 !important; }
|
| 1430 |
+
.row { padding: 0 !important; }
|
| 1431 |
+
|
| 1432 |
+
/* Tighten row gap so inputs cluster more naturally */
|
| 1433 |
+
.form, .row { gap: 16px !important; }
|
| 1434 |
+
|
| 1435 |
+
/* Tablet (≤900px): Gradio's gr.Row() flex-direction: row keeps 3 inputs
|
| 1436 |
+
in one line. min-width: 320px forces 3-column rows to wrap to 2x1 +
|
| 1437 |
+
1x1 at this size while leaving 2-column rows at 2-up. */
|
| 1438 |
+
@media (max-width: 900px) {
|
| 1439 |
+
.form,
|
| 1440 |
+
.row {
|
| 1441 |
+
flex-wrap: wrap !important;
|
| 1442 |
+
}
|
| 1443 |
+
.form > .block,
|
| 1444 |
+
.row > .block {
|
| 1445 |
+
flex: 1 1 calc(50% - 12px) !important;
|
| 1446 |
+
min-width: 320px !important;
|
| 1447 |
+
max-width: 100% !important;
|
| 1448 |
+
}
|
| 1449 |
+
}
|
| 1450 |
+
|
| 1451 |
+
/* Mobile (≤540px): single-column form. */
|
| 1452 |
+
@media (max-width: 540px) {
|
| 1453 |
+
.form,
|
| 1454 |
+
.row {
|
| 1455 |
+
flex-direction: column !important;
|
| 1456 |
+
}
|
| 1457 |
+
.form > .block,
|
| 1458 |
+
.row > .block {
|
| 1459 |
+
flex: 1 1 100% !important;
|
| 1460 |
+
min-width: 0 !important;
|
| 1461 |
+
width: 100% !important;
|
| 1462 |
+
}
|
| 1463 |
+
.gradio-container { padding: 12px !important; }
|
| 1464 |
+
.lc-hero-title { font-size: 26px !important; }
|
| 1465 |
+
.lc-pitch-num-bad, .lc-pitch-num-good { font-size: 22px !important; }
|
| 1466 |
+
.lc-pitch-arrow { display: none !important; }
|
| 1467 |
+
}
|
| 1468 |
+
|
| 1469 |
+
/* Inputs themselves — light border, soft fill */
|
| 1470 |
+
input[type="text"],
|
| 1471 |
+
input[type="number"],
|
| 1472 |
+
input[type="password"],
|
| 1473 |
+
textarea,
|
| 1474 |
+
select {
|
| 1475 |
+
border: 1px solid #e5e7eb !important;
|
| 1476 |
+
border-radius: 8px !important;
|
| 1477 |
+
background: #ffffff !important;
|
| 1478 |
+
font-size: 14px !important;
|
| 1479 |
+
padding: 10px 12px !important;
|
| 1480 |
+
}
|
| 1481 |
+
.dark input,
|
| 1482 |
+
.dark textarea,
|
| 1483 |
+
.dark select {
|
| 1484 |
+
background: #111827 !important;
|
| 1485 |
+
border-color: #374151 !important;
|
| 1486 |
+
}
|
| 1487 |
+
input:focus,
|
| 1488 |
+
textarea:focus {
|
| 1489 |
+
border-color: #4f46e5 !important;
|
| 1490 |
+
outline: none !important;
|
| 1491 |
+
box-shadow: 0 0 0 3px rgba(79,70,229,0.12) !important;
|
| 1492 |
+
}
|
| 1493 |
+
|
| 1494 |
+
/* Accordion — Gradio 6 has no .accordion class; the only signal is a .block
|
| 1495 |
+
that *contains* a button.label-wrap. Use :has() to match precisely. */
|
| 1496 |
+
.block.padded:has(> button.label-wrap) {
|
| 1497 |
+
background: #ffffff !important;
|
| 1498 |
+
border: 1px solid #e5e7eb !important;
|
| 1499 |
+
border-radius: 10px !important;
|
| 1500 |
+
margin: 14px 0 !important;
|
| 1501 |
+
padding: 0 !important;
|
| 1502 |
+
overflow: hidden !important;
|
| 1503 |
+
}
|
| 1504 |
+
.dark .block.padded:has(> button.label-wrap) {
|
| 1505 |
+
background: #111827 !important;
|
| 1506 |
+
border-color: #374151 !important;
|
| 1507 |
+
}
|
| 1508 |
+
button.label-wrap {
|
| 1509 |
+
background: #f8fafc !important;
|
| 1510 |
+
padding: 14px 18px !important;
|
| 1511 |
+
font-weight: 600 !important;
|
| 1512 |
+
font-size: 14px !important;
|
| 1513 |
+
color: #1f2937 !important;
|
| 1514 |
+
width: 100% !important;
|
| 1515 |
+
text-align: left !important;
|
| 1516 |
+
cursor: pointer !important;
|
| 1517 |
+
border: none !important;
|
| 1518 |
+
border-bottom: 1px solid #e5e7eb !important;
|
| 1519 |
+
display: flex !important;
|
| 1520 |
+
justify-content: space-between !important;
|
| 1521 |
+
align-items: center !important;
|
| 1522 |
+
letter-spacing: 0.01em;
|
| 1523 |
+
}
|
| 1524 |
+
.dark button.label-wrap {
|
| 1525 |
+
background: #1e293b !important;
|
| 1526 |
+
color: #f1f5f9 !important;
|
| 1527 |
+
border-bottom-color: #374151 !important;
|
| 1528 |
+
}
|
| 1529 |
+
button.label-wrap:hover { background: #f1f5f9 !important; }
|
| 1530 |
+
.dark button.label-wrap:hover { background: #334155 !important; }
|
| 1531 |
+
/* Sibling content of the header (the body when expanded) */
|
| 1532 |
+
.block.padded:has(> button.label-wrap) > *:not(button.label-wrap) {
|
| 1533 |
+
padding: 16px 18px !important;
|
| 1534 |
+
background: #ffffff !important;
|
| 1535 |
+
}
|
| 1536 |
+
.dark .block.padded:has(> button.label-wrap) > *:not(button.label-wrap) {
|
| 1537 |
+
background: #111827 !important;
|
| 1538 |
+
}
|
| 1539 |
+
|
| 1540 |
+
/* gr.Examples table — the default Gradio render is a raw HTML table with black
|
| 1541 |
+
borders and no hover state. Style it to match the rest of the page. */
|
| 1542 |
+
.gradio-dataset,
|
| 1543 |
+
[data-testid="dataset"] {
|
| 1544 |
+
margin-top: 24px !important;
|
| 1545 |
+
background: transparent !important;
|
| 1546 |
+
border: none !important;
|
| 1547 |
+
}
|
| 1548 |
+
.gradio-dataset table,
|
| 1549 |
+
[data-testid="dataset"] table {
|
| 1550 |
+
border-collapse: collapse !important;
|
| 1551 |
+
border: 1px solid #e5e7eb !important;
|
| 1552 |
+
border-radius: 8px !important;
|
| 1553 |
+
overflow: hidden !important;
|
| 1554 |
+
font-size: 13px !important;
|
| 1555 |
+
width: 100% !important;
|
| 1556 |
+
}
|
| 1557 |
+
.dark .gradio-dataset table,
|
| 1558 |
+
.dark [data-testid="dataset"] table { border-color: #374151 !important; }
|
| 1559 |
+
.gradio-dataset thead,
|
| 1560 |
+
[data-testid="dataset"] thead { background: #f9fafb !important; }
|
| 1561 |
+
.dark .gradio-dataset thead,
|
| 1562 |
+
.dark [data-testid="dataset"] thead { background: #111827 !important; }
|
| 1563 |
+
.gradio-dataset th,
|
| 1564 |
+
[data-testid="dataset"] th {
|
| 1565 |
+
font-size: 11px !important;
|
| 1566 |
+
font-weight: 600 !important;
|
| 1567 |
+
text-transform: uppercase !important;
|
| 1568 |
+
letter-spacing: 0.05em !important;
|
| 1569 |
+
color: #6b7280 !important;
|
| 1570 |
+
text-align: left !important;
|
| 1571 |
+
padding: 10px 12px !important;
|
| 1572 |
+
border: none !important;
|
| 1573 |
+
border-bottom: 1px solid #e5e7eb !important;
|
| 1574 |
+
}
|
| 1575 |
+
.gradio-dataset td,
|
| 1576 |
+
[data-testid="dataset"] td {
|
| 1577 |
+
padding: 9px 12px !important;
|
| 1578 |
+
border: none !important;
|
| 1579 |
+
border-bottom: 1px solid #f3f4f6 !important;
|
| 1580 |
+
color: #1f2937 !important;
|
| 1581 |
+
font-size: 13px !important;
|
| 1582 |
+
background: transparent !important;
|
| 1583 |
+
cursor: pointer !important;
|
| 1584 |
+
}
|
| 1585 |
+
.dark .gradio-dataset td,
|
| 1586 |
+
.dark [data-testid="dataset"] td {
|
| 1587 |
+
color: #e5e7eb !important;
|
| 1588 |
+
border-bottom-color: #1f2937 !important;
|
| 1589 |
+
}
|
| 1590 |
+
.gradio-dataset tbody tr:last-child td,
|
| 1591 |
+
[data-testid="dataset"] tbody tr:last-child td { border-bottom: none !important; }
|
| 1592 |
+
.gradio-dataset tbody tr:hover,
|
| 1593 |
+
[data-testid="dataset"] tbody tr:hover { background: rgba(79, 70, 229, 0.04) !important; }
|
| 1594 |
+
.dark .gradio-dataset tbody tr:hover,
|
| 1595 |
+
.dark [data-testid="dataset"] tbody tr:hover { background: rgba(129, 140, 248, 0.08) !important; }
|
| 1596 |
+
|
| 1597 |
+
/* Examples header label — Gradio puts a "Try one of these" label above */
|
| 1598 |
+
.gradio-dataset > .label,
|
| 1599 |
+
[data-testid="dataset"] > .label,
|
| 1600 |
+
.gradio-dataset .block-label,
|
| 1601 |
+
.dataset .block-label {
|
| 1602 |
+
font-size: 11px !important;
|
| 1603 |
+
font-weight: 600 !important;
|
| 1604 |
+
text-transform: uppercase !important;
|
| 1605 |
+
letter-spacing: 0.06em !important;
|
| 1606 |
+
color: #6b7280 !important;
|
| 1607 |
+
background: transparent !important;
|
| 1608 |
+
border: none !important;
|
| 1609 |
+
padding: 0 0 6px 0 !important;
|
| 1610 |
+
margin-bottom: 0 !important;
|
| 1611 |
+
}
|
| 1612 |
+
|
| 1613 |
+
/* Footer link strip */
|
| 1614 |
+
.lc-footer {
|
| 1615 |
+
margin-top: 28px;
|
| 1616 |
+
padding: 14px 0;
|
| 1617 |
+
border-top: 1px solid #e5e7eb;
|
| 1618 |
+
font-size: 13px !important;
|
| 1619 |
+
color: #6b7280 !important;
|
| 1620 |
+
}
|
| 1621 |
+
.dark .lc-footer { border-top-color: #374151; }
|
| 1622 |
+
.lc-footer a { color: #4f46e5 !important; text-decoration: none; }
|
| 1623 |
+
.lc-footer a:hover { text-decoration: underline; }
|
| 1624 |
+
.dark .lc-footer a { color: #818cf8 !important; }
|
| 1625 |
+
|
| 1626 |
+
/* Result wrapper */
|
| 1627 |
+
.lc-result {
|
| 1628 |
+
padding: 4px 0;
|
| 1629 |
+
font-size: 14px;
|
| 1630 |
+
line-height: 1.55;
|
| 1631 |
+
color: #111827 !important;
|
| 1632 |
+
}
|
| 1633 |
+
.dark .lc-result { color: #f3f4f6 !important; }
|
| 1634 |
+
|
| 1635 |
+
/* Headline */
|
| 1636 |
+
.lc-header { padding: 4px 0 14px 0; border-bottom: 1px solid #e5e7eb; }
|
| 1637 |
+
.dark .lc-header { border-bottom-color: #374151; }
|
| 1638 |
+
.lc-title {
|
| 1639 |
+
font-size: 22px !important;
|
| 1640 |
+
font-weight: 700 !important;
|
| 1641 |
+
letter-spacing: -0.01em;
|
| 1642 |
+
color: #0f172a !important;
|
| 1643 |
+
}
|
| 1644 |
+
.dark .lc-title { color: #f8fafc !important; }
|
| 1645 |
+
.lc-subtitle {
|
| 1646 |
+
font-size: 13px !important;
|
| 1647 |
+
color: #6b7280 !important;
|
| 1648 |
+
margin-top: 2px;
|
| 1649 |
+
}
|
| 1650 |
+
|
| 1651 |
+
/* Headline stat cards */
|
| 1652 |
+
.lc-stats {
|
| 1653 |
+
display: grid;
|
| 1654 |
+
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
|
| 1655 |
+
gap: 12px;
|
| 1656 |
+
margin: 16px 0 8px 0;
|
| 1657 |
+
}
|
| 1658 |
+
.lc-stat {
|
| 1659 |
+
border: 1px solid #e5e7eb;
|
| 1660 |
+
border-radius: 10px;
|
| 1661 |
+
padding: 14px 16px;
|
| 1662 |
+
background: #ffffff;
|
| 1663 |
+
}
|
| 1664 |
+
.dark .lc-stat { background: #111827; border-color: #374151; }
|
| 1665 |
+
.lc-stat-value {
|
| 1666 |
+
font-size: 24px !important;
|
| 1667 |
+
font-weight: 700 !important;
|
| 1668 |
+
letter-spacing: -0.01em;
|
| 1669 |
+
line-height: 1.2;
|
| 1670 |
+
color: #0f172a !important;
|
| 1671 |
+
}
|
| 1672 |
+
.dark .lc-stat-value { color: #f8fafc !important; }
|
| 1673 |
+
.lc-stat-label {
|
| 1674 |
+
font-size: 11px !important;
|
| 1675 |
+
text-transform: uppercase;
|
| 1676 |
+
letter-spacing: 0.05em;
|
| 1677 |
+
color: #6b7280 !important;
|
| 1678 |
+
margin-top: 4px;
|
| 1679 |
+
font-weight: 500 !important;
|
| 1680 |
+
}
|
| 1681 |
+
.lc-stat-sub {
|
| 1682 |
+
font-size: 11px !important;
|
| 1683 |
+
color: #9ca3af !important;
|
| 1684 |
+
margin-top: 2px;
|
| 1685 |
+
}
|
| 1686 |
+
.lc-stat-chip { margin-top: 10px; }
|
| 1687 |
+
|
| 1688 |
+
.lc-chip {
|
| 1689 |
+
display: inline-block;
|
| 1690 |
+
padding: 2px 8px;
|
| 1691 |
+
border-radius: 999px;
|
| 1692 |
+
font-size: 11px !important;
|
| 1693 |
+
font-weight: 600 !important;
|
| 1694 |
+
letter-spacing: 0.02em;
|
| 1695 |
+
}
|
| 1696 |
+
|
| 1697 |
+
.lc-prov {
|
| 1698 |
+
margin-top: 6px;
|
| 1699 |
+
font-size: 12px !important;
|
| 1700 |
+
color: #6b7280 !important;
|
| 1701 |
+
font-style: italic;
|
| 1702 |
+
}
|
| 1703 |
+
|
| 1704 |
+
/* Sections */
|
| 1705 |
+
.lc-section { margin: 24px 0 0 0; }
|
| 1706 |
+
.lc-section h3 {
|
| 1707 |
+
font-size: 13px !important;
|
| 1708 |
+
font-weight: 600 !important;
|
| 1709 |
+
text-transform: uppercase;
|
| 1710 |
+
letter-spacing: 0.06em;
|
| 1711 |
+
color: #6b7280 !important;
|
| 1712 |
+
margin: 0 0 6px 0 !important;
|
| 1713 |
+
}
|
| 1714 |
+
.lc-section-help {
|
| 1715 |
+
font-size: 12px !important;
|
| 1716 |
+
color: #6b7280 !important;
|
| 1717 |
+
margin: 0 0 10px 0;
|
| 1718 |
+
line-height: 1.5;
|
| 1719 |
+
}
|
| 1720 |
+
|
| 1721 |
+
/* Tables */
|
| 1722 |
+
.lc-table {
|
| 1723 |
+
width: 100%;
|
| 1724 |
+
border-collapse: collapse;
|
| 1725 |
+
font-size: 13px !important;
|
| 1726 |
+
color: #111827 !important;
|
| 1727 |
+
}
|
| 1728 |
+
.dark .lc-table { color: #f3f4f6 !important; }
|
| 1729 |
+
.lc-table th, .lc-table td {
|
| 1730 |
+
padding: 8px 10px;
|
| 1731 |
+
border-bottom: 1px solid #f3f4f6;
|
| 1732 |
+
text-align: left;
|
| 1733 |
+
}
|
| 1734 |
+
.dark .lc-table th, .dark .lc-table td { border-bottom-color: #1f2937; }
|
| 1735 |
+
.lc-table th {
|
| 1736 |
+
font-size: 11px !important;
|
| 1737 |
+
text-transform: uppercase;
|
| 1738 |
+
letter-spacing: 0.04em;
|
| 1739 |
+
color: #6b7280 !important;
|
| 1740 |
+
font-weight: 500 !important;
|
| 1741 |
+
}
|
| 1742 |
+
.lc-table-recon td:nth-child(2),
|
| 1743 |
+
.lc-table-recon td:nth-child(3) { text-align: right; }
|
| 1744 |
+
.lc-best { background: rgba(22, 163, 74, 0.08); }
|
| 1745 |
+
.dark .lc-best { background: rgba(22, 163, 74, 0.18); }
|
| 1746 |
+
|
| 1747 |
+
/* Performance grid */
|
| 1748 |
+
.lc-perf {
|
| 1749 |
+
display: grid;
|
| 1750 |
+
grid-template-columns: repeat(auto-fit, minmax(170px, 1fr));
|
| 1751 |
+
gap: 12px;
|
| 1752 |
+
}
|
| 1753 |
+
.lc-perf-item {
|
| 1754 |
+
border: 1px solid #e5e7eb;
|
| 1755 |
+
border-radius: 10px;
|
| 1756 |
+
padding: 12px 14px;
|
| 1757 |
+
background: #ffffff;
|
| 1758 |
+
}
|
| 1759 |
+
.dark .lc-perf-item { border-color: #374151; background: #111827; }
|
| 1760 |
+
.lc-perf-value {
|
| 1761 |
+
font-size: 20px !important;
|
| 1762 |
+
font-weight: 700 !important;
|
| 1763 |
+
letter-spacing: -0.01em;
|
| 1764 |
+
color: #0f172a !important;
|
| 1765 |
+
line-height: 1.2;
|
| 1766 |
+
}
|
| 1767 |
+
.dark .lc-perf-value { color: #f8fafc !important; }
|
| 1768 |
+
.lc-perf-value code {
|
| 1769 |
+
font-size: 16px !important;
|
| 1770 |
+
font-weight: 600 !important;
|
| 1771 |
+
background: transparent !important;
|
| 1772 |
+
color: #0f172a !important;
|
| 1773 |
+
padding: 0 !important;
|
| 1774 |
+
}
|
| 1775 |
+
.dark .lc-perf-value code { color: #f8fafc !important; }
|
| 1776 |
+
.lc-perf-label {
|
| 1777 |
+
font-size: 11px !important;
|
| 1778 |
+
text-transform: uppercase;
|
| 1779 |
+
letter-spacing: 0.05em;
|
| 1780 |
+
color: #6b7280 !important;
|
| 1781 |
+
margin-top: 4px;
|
| 1782 |
+
font-weight: 500 !important;
|
| 1783 |
+
}
|
| 1784 |
+
.lc-perf-sub {
|
| 1785 |
+
font-size: 11px !important;
|
| 1786 |
+
color: #9ca3af !important;
|
| 1787 |
+
margin-top: 1px;
|
| 1788 |
+
}
|
| 1789 |
+
|
| 1790 |
+
/* Inline code */
|
| 1791 |
+
.lc-result code {
|
| 1792 |
+
font-family: "SF Mono", "JetBrains Mono", Menlo, Consolas, monospace !important;
|
| 1793 |
+
font-size: 0.92em !important;
|
| 1794 |
+
color: #0f172a !important;
|
| 1795 |
+
background: rgba(15, 23, 42, 0.06);
|
| 1796 |
+
padding: 1px 5px;
|
| 1797 |
+
border-radius: 4px;
|
| 1798 |
+
}
|
| 1799 |
+
.dark .lc-result code {
|
| 1800 |
+
color: #e2e8f0 !important;
|
| 1801 |
+
background: rgba(226, 232, 240, 0.08);
|
| 1802 |
+
}
|
| 1803 |
+
|
| 1804 |
+
/* Generated command — ALWAYS dark theme regardless of mode */
|
| 1805 |
+
.lc-cmd {
|
| 1806 |
+
background: #0b1220 !important;
|
| 1807 |
+
color: #f1f5f9 !important;
|
| 1808 |
+
padding: 16px 18px !important;
|
| 1809 |
+
border-radius: 8px;
|
| 1810 |
+
font-size: 12.5px !important;
|
| 1811 |
+
overflow-x: auto;
|
| 1812 |
+
white-space: pre;
|
| 1813 |
+
border: 1px solid #1e293b !important;
|
| 1814 |
+
margin: 0 !important;
|
| 1815 |
+
}
|
| 1816 |
+
.lc-cmd code {
|
| 1817 |
+
font-family: "SF Mono", "JetBrains Mono", Menlo, Consolas, monospace !important;
|
| 1818 |
+
background: transparent !important;
|
| 1819 |
+
color: #f1f5f9 !important;
|
| 1820 |
+
padding: 0 !important;
|
| 1821 |
+
font-size: 12.5px !important;
|
| 1822 |
+
border-radius: 0 !important;
|
| 1823 |
+
}
|
| 1824 |
+
|
| 1825 |
+
/* Comparison view — side-by-side metrics across GPUs */
|
| 1826 |
+
.lc-cmp-wrap {
|
| 1827 |
+
overflow-x: auto;
|
| 1828 |
+
margin: 8px 0 12px 0;
|
| 1829 |
+
border: 1px solid #e5e7eb;
|
| 1830 |
+
border-radius: 10px;
|
| 1831 |
+
background: #ffffff;
|
| 1832 |
+
}
|
| 1833 |
+
.dark .lc-cmp-wrap { background: #111827; border-color: #374151; }
|
| 1834 |
+
.lc-cmp-table {
|
| 1835 |
+
width: 100%;
|
| 1836 |
+
border-collapse: collapse;
|
| 1837 |
+
font-size: 13px !important;
|
| 1838 |
+
}
|
| 1839 |
+
.lc-cmp-table th,
|
| 1840 |
+
.lc-cmp-table td {
|
| 1841 |
+
padding: 10px 12px;
|
| 1842 |
+
text-align: left;
|
| 1843 |
+
border-bottom: 1px solid #f3f4f6;
|
| 1844 |
+
}
|
| 1845 |
+
.dark .lc-cmp-table th,
|
| 1846 |
+
.dark .lc-cmp-table td { border-bottom-color: #1f2937; }
|
| 1847 |
+
.lc-cmp-table thead th {
|
| 1848 |
+
font-size: 11px !important;
|
| 1849 |
+
text-transform: uppercase;
|
| 1850 |
+
letter-spacing: 0.05em;
|
| 1851 |
+
color: #6b7280 !important;
|
| 1852 |
+
font-weight: 600 !important;
|
| 1853 |
+
background: #f9fafb;
|
| 1854 |
+
}
|
| 1855 |
+
.dark .lc-cmp-table thead th { background: #1e293b; color: #9ca3af !important; }
|
| 1856 |
+
.lc-cmp-row-label {
|
| 1857 |
+
font-size: 12px !important;
|
| 1858 |
+
color: #6b7280 !important;
|
| 1859 |
+
font-weight: 600 !important;
|
| 1860 |
+
white-space: nowrap;
|
| 1861 |
+
}
|
| 1862 |
+
.lc-cmp-row-info {
|
| 1863 |
+
font-style: italic;
|
| 1864 |
+
color: #9ca3af !important;
|
| 1865 |
+
}
|
| 1866 |
+
.dark .lc-cmp-row-info { color: #6b7280 !important; }
|
| 1867 |
+
.lc-cmp-tr-info td {
|
| 1868 |
+
color: #6b7280;
|
| 1869 |
+
background: #fafafa;
|
| 1870 |
+
}
|
| 1871 |
+
.dark .lc-cmp-tr-info td { color: #9ca3af; background: #0f172a; }
|
| 1872 |
+
.lc-cmp-gpu {
|
| 1873 |
+
font-family: "SF Mono", "JetBrains Mono", Menlo, monospace !important;
|
| 1874 |
+
font-size: 12px !important;
|
| 1875 |
+
}
|
| 1876 |
+
.lc-cmp-table tbody tr:last-child td { border-bottom: none; }
|
| 1877 |
+
.lc-cmp-winner {
|
| 1878 |
+
background: rgba(22, 163, 74, 0.10) !important;
|
| 1879 |
+
font-weight: 700 !important;
|
| 1880 |
+
color: #15803d !important;
|
| 1881 |
+
position: relative;
|
| 1882 |
+
}
|
| 1883 |
+
.dark .lc-cmp-winner { background: rgba(74, 222, 128, 0.15) !important; color: #4ade80 !important; }
|
| 1884 |
+
.lc-cmp-winner::before {
|
| 1885 |
+
content: "✓ ";
|
| 1886 |
+
font-size: 11px;
|
| 1887 |
+
font-weight: 700;
|
| 1888 |
+
color: #15803d;
|
| 1889 |
+
margin-right: 2px;
|
| 1890 |
+
}
|
| 1891 |
+
.dark .lc-cmp-winner::before { color: #4ade80; }
|
| 1892 |
+
.lc-cmp-summary {
|
| 1893 |
+
margin-top: 12px;
|
| 1894 |
+
padding: 12px 14px;
|
| 1895 |
+
border-radius: 8px;
|
| 1896 |
+
background: #eef2ff;
|
| 1897 |
+
border: 1px solid #c7d2fe;
|
| 1898 |
+
font-size: 13px !important;
|
| 1899 |
+
color: #312e81 !important;
|
| 1900 |
+
}
|
| 1901 |
+
.dark .lc-cmp-summary {
|
| 1902 |
+
background: #1e1b4b;
|
| 1903 |
+
border-color: #3730a3;
|
| 1904 |
+
color: #e0e7ff !important;
|
| 1905 |
+
}
|
| 1906 |
+
.lc-cmp-summary strong { color: #4338ca; }
|
| 1907 |
+
.dark .lc-cmp-summary strong { color: #a5b4fc; }
|
| 1908 |
+
|
| 1909 |
+
/* Per-GPU detail cards under the table */
|
| 1910 |
+
.lc-cmp-details {
|
| 1911 |
+
display: grid;
|
| 1912 |
+
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
| 1913 |
+
gap: 12px;
|
| 1914 |
+
}
|
| 1915 |
+
.lc-cmp-detail {
|
| 1916 |
+
border: 1px solid #e5e7eb;
|
| 1917 |
+
border-radius: 10px;
|
| 1918 |
+
padding: 12px 14px;
|
| 1919 |
+
background: #ffffff;
|
| 1920 |
+
}
|
| 1921 |
+
.dark .lc-cmp-detail { background: #111827; border-color: #374151; }
|
| 1922 |
+
.lc-cmp-detail-gpu {
|
| 1923 |
+
font-family: "SF Mono", "JetBrains Mono", Menlo, monospace !important;
|
| 1924 |
+
font-size: 13px !important;
|
| 1925 |
+
font-weight: 700 !important;
|
| 1926 |
+
color: #0f172a !important;
|
| 1927 |
+
margin-bottom: 6px;
|
| 1928 |
+
padding-bottom: 6px;
|
| 1929 |
+
border-bottom: 1px solid #e5e7eb;
|
| 1930 |
+
}
|
| 1931 |
+
.dark .lc-cmp-detail-gpu { color: #f8fafc !important; border-bottom-color: #374151; }
|
| 1932 |
+
.lc-cmp-detail-row {
|
| 1933 |
+
display: flex;
|
| 1934 |
+
justify-content: space-between;
|
| 1935 |
+
font-size: 12px !important;
|
| 1936 |
+
padding: 3px 0;
|
| 1937 |
+
}
|
| 1938 |
+
.lc-cmp-detail-row span { color: #6b7280 !important; }
|
| 1939 |
+
.lc-cmp-detail-row strong {
|
| 1940 |
+
color: #0f172a !important;
|
| 1941 |
+
font-size: 13px !important;
|
| 1942 |
+
}
|
| 1943 |
+
.dark .lc-cmp-detail-row strong { color: #f8fafc !important; }
|
| 1944 |
+
|
| 1945 |
+
/* Star-on-GitHub CTA — shown at the bottom of the result, capturing the
|
| 1946 |
+
peak-satisfaction moment. Card-style with indigo accent so it reads as
|
| 1947 |
+
"thanks", not as a banner ad. */
|
| 1948 |
+
.lc-star-cta {
|
| 1949 |
+
display: flex;
|
| 1950 |
+
align-items: center;
|
| 1951 |
+
gap: 14px;
|
| 1952 |
+
margin: 28px 0 8px 0;
|
| 1953 |
+
padding: 14px 18px;
|
| 1954 |
+
border: 1px solid #c7d2fe;
|
| 1955 |
+
background: #eef2ff;
|
| 1956 |
+
border-radius: 10px;
|
| 1957 |
+
text-decoration: none !important;
|
| 1958 |
+
color: #312e81 !important;
|
| 1959 |
+
transition: background 0.15s ease, border-color 0.15s ease, transform 0.1s ease;
|
| 1960 |
+
}
|
| 1961 |
+
.lc-star-cta:hover {
|
| 1962 |
+
background: #e0e7ff;
|
| 1963 |
+
border-color: #a5b4fc;
|
| 1964 |
+
}
|
| 1965 |
+
.lc-star-cta:active { transform: scale(0.995); }
|
| 1966 |
+
.dark .lc-star-cta {
|
| 1967 |
+
background: #1e1b4b;
|
| 1968 |
+
border-color: #3730a3;
|
| 1969 |
+
color: #c7d2fe !important;
|
| 1970 |
+
}
|
| 1971 |
+
.dark .lc-star-cta:hover { background: #312e81; }
|
| 1972 |
+
.lc-star-cta svg { flex: 0 0 auto; color: #4338ca; }
|
| 1973 |
+
.dark .lc-star-cta svg { color: #a5b4fc; }
|
| 1974 |
+
.lc-star-cta-text { flex: 1 1 auto; min-width: 0; }
|
| 1975 |
+
.lc-star-cta-q {
|
| 1976 |
+
font-size: 14px !important;
|
| 1977 |
+
font-weight: 600 !important;
|
| 1978 |
+
line-height: 1.3;
|
| 1979 |
+
color: #312e81 !important;
|
| 1980 |
+
}
|
| 1981 |
+
.dark .lc-star-cta-q { color: #e0e7ff !important; }
|
| 1982 |
+
.lc-star-cta-q-en {
|
| 1983 |
+
font-size: 12px !important;
|
| 1984 |
+
color: #6366f1 !important;
|
| 1985 |
+
margin-top: 2px;
|
| 1986 |
+
line-height: 1.3;
|
| 1987 |
+
}
|
| 1988 |
+
.dark .lc-star-cta-q-en { color: #a5b4fc !important; }
|
| 1989 |
+
.lc-star-cta-action {
|
| 1990 |
+
flex: 0 0 auto;
|
| 1991 |
+
font-size: 13px !important;
|
| 1992 |
+
font-weight: 700 !important;
|
| 1993 |
+
color: #4338ca !important;
|
| 1994 |
+
white-space: nowrap;
|
| 1995 |
+
}
|
| 1996 |
+
.dark .lc-star-cta-action { color: #c7d2fe !important; }
|
| 1997 |
+
@media (max-width: 540px) {
|
| 1998 |
+
.lc-star-cta { flex-wrap: wrap; gap: 10px; }
|
| 1999 |
+
.lc-star-cta-action { flex-basis: 100%; }
|
| 2000 |
+
}
|
| 2001 |
+
|
| 2002 |
+
/* Loading + error */
|
| 2003 |
+
.lc-loading {
|
| 2004 |
+
display: flex;
|
| 2005 |
+
align-items: center;
|
| 2006 |
+
gap: 14px;
|
| 2007 |
+
padding: 24px;
|
| 2008 |
+
color: #6b7280 !important;
|
| 2009 |
+
font-size: 14px !important;
|
| 2010 |
+
}
|
| 2011 |
+
.lc-spinner {
|
| 2012 |
+
width: 18px; height: 18px;
|
| 2013 |
+
border: 2px solid #cbd5e1;
|
| 2014 |
+
border-top-color: #4f46e5;
|
| 2015 |
+
border-radius: 50%;
|
| 2016 |
+
animation: lc-spin 0.7s linear infinite;
|
| 2017 |
+
flex: none;
|
| 2018 |
+
}
|
| 2019 |
+
@keyframes lc-spin { to { transform: rotate(360deg); } }
|
| 2020 |
+
|
| 2021 |
+
.lc-error pre {
|
| 2022 |
+
background: #fef2f2;
|
| 2023 |
+
color: #991b1b !important;
|
| 2024 |
+
padding: 12px 14px;
|
| 2025 |
+
border-radius: 8px;
|
| 2026 |
+
border: 1px solid #fecaca;
|
| 2027 |
+
font-size: 12px !important;
|
| 2028 |
+
white-space: pre-wrap;
|
| 2029 |
+
word-break: break-word;
|
| 2030 |
+
margin: 0;
|
| 2031 |
+
}
|
| 2032 |
+
.dark .lc-error pre { background: #450a0a; color: #fca5a5 !important; border-color: #7f1d1d; }
|
| 2033 |
+
|
| 2034 |
+
/* Explain trace */
|
| 2035 |
+
.lc-explain-entry {
|
| 2036 |
+
margin: 14px 0;
|
| 2037 |
+
padding: 14px 16px;
|
| 2038 |
+
border: 1px solid #e5e7eb;
|
| 2039 |
+
border-left: 3px solid #4f46e5;
|
| 2040 |
+
border-radius: 8px;
|
| 2041 |
+
background: #fafafa;
|
| 2042 |
+
}
|
| 2043 |
+
.dark .lc-explain-entry { background: #0f172a; border-color: #374151; border-left-color: #818cf8; }
|
| 2044 |
+
.lc-explain-heading {
|
| 2045 |
+
font-weight: 700 !important;
|
| 2046 |
+
font-size: 14px !important;
|
| 2047 |
+
margin-bottom: 8px;
|
| 2048 |
+
color: #0f172a !important;
|
| 2049 |
+
}
|
| 2050 |
+
.dark .lc-explain-heading { color: #f8fafc !important; }
|
| 2051 |
+
.lc-explain-formula {
|
| 2052 |
+
margin: 6px 0;
|
| 2053 |
+
font-size: 12.5px !important;
|
| 2054 |
+
}
|
| 2055 |
+
.lc-explain-formula code {
|
| 2056 |
+
background: rgba(79, 70, 229, 0.08) !important;
|
| 2057 |
+
color: #4338ca !important;
|
| 2058 |
+
padding: 4px 8px !important;
|
| 2059 |
+
border-radius: 4px;
|
| 2060 |
+
}
|
| 2061 |
+
.dark .lc-explain-formula code { color: #a5b4fc !important; background: rgba(165, 180, 252, 0.12) !important; }
|
| 2062 |
+
.lc-explain-inputs, .lc-explain-steps {
|
| 2063 |
+
margin: 6px 0 6px 1.2em;
|
| 2064 |
+
font-size: 12.5px !important;
|
| 2065 |
+
line-height: 1.7;
|
| 2066 |
+
}
|
| 2067 |
+
.lc-explain-label {
|
| 2068 |
+
font-size: 11px !important;
|
| 2069 |
+
color: #6b7280 !important;
|
| 2070 |
+
font-style: italic;
|
| 2071 |
+
}
|
| 2072 |
+
.lc-explain-result {
|
| 2073 |
+
margin-top: 8px;
|
| 2074 |
+
padding-top: 8px;
|
| 2075 |
+
border-top: 1px dashed #e5e7eb;
|
| 2076 |
+
font-size: 13px !important;
|
| 2077 |
+
color: #0f172a !important;
|
| 2078 |
+
}
|
| 2079 |
+
.dark .lc-explain-result { color: #f8fafc !important; border-top-color: #374151; }
|
| 2080 |
+
|
| 2081 |
+
/* LLM review */
|
| 2082 |
+
.lc-llm-banner {
|
| 2083 |
+
display: flex;
|
| 2084 |
+
align-items: center;
|
| 2085 |
+
gap: 8px;
|
| 2086 |
+
padding: 8px 12px;
|
| 2087 |
+
background: #f9fafb;
|
| 2088 |
+
border: 1px solid #e5e7eb;
|
| 2089 |
+
border-radius: 8px;
|
| 2090 |
+
font-size: 12px !important;
|
| 2091 |
+
color: #4b5563 !important;
|
| 2092 |
+
margin-bottom: 12px;
|
| 2093 |
+
}
|
| 2094 |
+
.dark .lc-llm-banner { color: #d1d5db !important; background: #111827; border-color: #374151; }
|
| 2095 |
+
.lc-llm-model {
|
| 2096 |
+
font-size: 11px !important;
|
| 2097 |
+
color: #6b7280 !important;
|
| 2098 |
+
font-weight: 500 !important;
|
| 2099 |
+
margin-left: 6px;
|
| 2100 |
+
text-transform: none !important;
|
| 2101 |
+
letter-spacing: 0 !important;
|
| 2102 |
+
}
|
| 2103 |
+
.lc-llm-content {
|
| 2104 |
+
font-size: 13px !important;
|
| 2105 |
+
line-height: 1.7;
|
| 2106 |
+
color: #0f172a !important;
|
| 2107 |
+
padding: 12px 14px;
|
| 2108 |
+
border: 1px solid #e5e7eb;
|
| 2109 |
+
border-radius: 8px;
|
| 2110 |
+
background: #ffffff;
|
| 2111 |
+
}
|
| 2112 |
+
.dark .lc-llm-content { color: #f3f4f6 !important; background: #111827; border-color: #374151; }
|
| 2113 |
+
"""
|
| 2114 |
+
|
| 2115 |
+
|
| 2116 |
+
def _build_ui() -> gr.Blocks:
    """Assemble the Gradio Blocks app and wire up its event handlers.

    Layout, top to bottom: hero banner, required inputs (model / source /
    vendor / GPU / engine / context / language), a collapsed "Performance
    tuning" accordion, a collapsed "Advanced" accordion, the submit button,
    three HTML output panes, an Examples table and a footer.

    Returns:
        The constructed (not yet launched) ``gr.Blocks`` instance.
    """
    with gr.Blocks(title="llm-cal — LLM hardware calculator") as demo:
        gr.HTML(HERO_HTML)

        # ---- Required inputs: model + metadata source --------------------
        with gr.Row():
            model_id = gr.Textbox(
                label="Model ID · 模型 ID",
                placeholder="e.g. deepseek-ai/DeepSeek-V4-Flash",
                info="Repo id · 仓库 ID(owner/name)",
                scale=3,
            )
            source = gr.Radio(
                label="Source · 来源",
                choices=["HuggingFace", "ModelScope"],
                value="HuggingFace",
                info="Where to pull model metadata · 拉取来源",
                scale=2,
            )

        # ---- Required inputs: hardware -----------------------------------
        with gr.Row():
            vendor = gr.Dropdown(
                label="GPU vendor · GPU 厂商",
                choices=VENDOR_CHOICES_EN,
                value=DEFAULT_VENDOR,
                info="11 vendors covered · 共 11 家",
                scale=1,
            )
            gpu = gr.Dropdown(
                label="GPU model · GPU 型号",
                choices=_VENDOR_TO_GPUS[DEFAULT_VENDOR],
                value=[DEFAULT_GPU],
                info="One GPU = single eval. 2-4 = compare side-by-side · 选 1 张单评估,2-4 张对比",
                scale=2,
                multiselect=True,
                max_choices=4,
                allow_custom_value=True,
            )

        # ---- Required inputs: engine / context / output language ---------
        with gr.Row():
            engine = gr.Radio(
                label="Engine · 引擎",
                choices=["vllm", "sglang"],
                value="vllm",
                info="Inference engine · 推理引擎",
            )
            context_length = gr.Number(
                label="Context length · Context 长度",
                value=None,
                precision=0,
                info="Empty = 4K/32K/128K/1M · 留空显示全档",
            )
            lang = gr.Radio(
                label="Output language · 输出语言",
                choices=["English", "中文"],
                value="English",
                info="Result area only · 仅影响下方结果区",
            )

        # ---- Performance tuning (collapsed by default) -------------------
        with gr.Accordion("Performance tuning · 性能参数", open=False):
            with gr.Row():
                input_tokens = gr.Number(
                    label="Input tokens · 输入 tokens",
                    value=2000,
                    precision=0,
                    info="Prefill budget · Prefill 预算",
                )
                output_tokens = gr.Number(
                    label="Output tokens · 输出 tokens",
                    value=512,
                    precision=0,
                    info="Decode budget · Decode 预算",
                )
                target_tps = gr.Number(
                    label="Target tok/s/user · 单用户目标 tok/s",
                    value=30.0,
                    info="SLA per user · 单用户 SLA(30 ≈ 流畅阅读)",
                )
            with gr.Row():
                prefill_util = gr.Number(
                    label="Prefill util · Prefill 利用率",
                    value=0.40,
                    info="0–1 · 0.40 = vLLM paper baseline",
                )
                decode_bw_util = gr.Number(
                    label="Decode BW util · Decode 带宽利用率",
                    value=0.50,
                    info="0–1 · 0.50 = community median",
                )
                concurrency_degradation = gr.Number(
                    label="Concurrency degradation · 并发衰减",
                    value=1.0,
                    info="1.0 = honest · 1.67 = 60% efficiency under load",
                )

        # ---- Advanced options (collapsed by default) ---------------------
        with gr.Accordion("Advanced · 高级", open=False):
            with gr.Row():
                hf_token = gr.Textbox(
                    label="HF_TOKEN",
                    value="",
                    placeholder="hf_...",
                    type="password",
                    info="For gated HF models · 私有 HF 模型用",
                )
                ms_token = gr.Textbox(
                    label="MODELSCOPE_API_TOKEN",
                    value="",
                    placeholder="ms-...",
                    type="password",
                    info="For gated MS models · 私有 MS 模型用",
                )
            with gr.Row():
                gpu_count = gr.Number(
                    label="Force GPU count · 强制 GPU 数",
                    value=None,
                    precision=0,
                    info="Empty = auto min/dev/prod · 留空自动给三档",
                )
                refresh = gr.Checkbox(
                    label="Refresh cache · 刷新缓存",
                    value=False,
                    info="Bypass diskcache · 跳过本地缓存",
                )
            with gr.Row():
                explain = gr.Checkbox(
                    label="--explain · 推导链",
                    value=False,
                    info="Full derivation trace · 输出完整推导链",
                )
                llm_review = gr.Checkbox(
                    label="--llm-review · LLM 审计",
                    value=False,
                    info="Second opinion from an LLM · 第二意见审计",
                )
            with gr.Row():
                llm_api_key = gr.Textbox(
                    label="LLM API key · LLM API 密钥",
                    value="",
                    placeholder="sk-...",
                    type="password",
                    info="OpenAI-compatible endpoint · OpenAI 兼容端点",
                )
                llm_base_url = gr.Textbox(
                    label="LLM base URL · LLM 基地址",
                    value="",
                    placeholder="https://api.openai.com/v1",
                    info="e.g. https://api.deepseek.com/v1",
                )
                llm_model = gr.Textbox(
                    label="LLM model · LLM 模型名",
                    value="",
                    placeholder="gpt-4o",
                    info="e.g. gpt-4o / deepseek-chat / MiniMax-M2",
                )

        with gr.Row(elem_classes="lc-submit-wrap"):
            submit = gr.Button("Calculate · 计算", variant="primary", size="lg")

        # Output panes: the main result is always filled; the explain trace
        # and LLM review panes are only meaningful when their toggles are on.
        output_main = gr.HTML(label="Result")
        output_explain = gr.HTML(label="Explain trace")
        output_llm = gr.HTML(label="LLM review")

        # One Examples row per EXAMPLE_MODELS entry. The GPU cell is wrapped
        # in a list because the GPU dropdown is multiselect.
        example_rows = [
            [m, v, [g], e, None, "English", s]
            for m, v, g, e, s in EXAMPLE_MODELS
        ]
        gr.Examples(
            examples=example_rows,
            inputs=[model_id, vendor, gpu, engine, context_length, lang, source],
            label="Try one of these · 试试这些组合",
        )

        footer_parts = [
            "<div class='lc-footer'>",
            "<a href='https://github.com/FlyTOmeLight/llm-cal' target='_blank'>GitHub</a> · ",
            "<a href='https://flytomelight.github.io/llm-cal/' target='_blank'>Docs</a> · ",
            "<a href='https://flytomelight.github.io/llm-cal/methodology/' target='_blank'>Methodology</a> · ",
            "<code>pip install llm-cal</code>",
            "</div>",
        ]
        gr.HTML("".join(footer_parts))

        # Switching vendor swaps the GPU choice list but keeps whatever is
        # already selected, so compare mode can mix GPUs from several vendors
        # (e.g. H800 + MI300X + 910B4).
        def _on_vendor_change(v: str, current):  # noqa: ANN001, ANN202
            options = _VENDOR_TO_GPUS.get(v, [])
            # A multiselect dropdown normally hands over a list; tolerate a
            # bare string or None as well.
            if isinstance(current, list):
                selected = list(current)
            else:
                selected = [current] if current else []
            # Nothing selected: fall back to the vendor's first GPU, if any,
            # so the form remains submittable.
            if not selected and options:
                selected = [options[0]]
            return gr.Dropdown(choices=options, value=selected)

        vendor.change(fn=_on_vendor_change, inputs=[vendor, gpu], outputs=[gpu])

        # Submit flow: paint the loading placeholder first, then run the
        # actual calculation into the same three panes.
        all_outputs = [output_main, output_explain, output_llm]
        calculate_inputs = [
            model_id, gpu, engine, context_length, lang, source,
            gpu_count, input_tokens, output_tokens, target_tps,
            prefill_util, decode_bw_util, concurrency_degradation,
            refresh, explain, llm_review,
            hf_token, ms_token,
            llm_api_key, llm_base_url, llm_model,
        ]
        loading_event = submit.click(
            fn=show_loading,
            inputs=[lang],
            outputs=all_outputs,
        )
        loading_event.then(
            fn=calculate,
            inputs=calculate_inputs,
            outputs=all_outputs,
        )

    return demo
|
| 2339 |
+
|
| 2340 |
+
|
| 2341 |
+
def _prewarm_cache() -> None:
    """Warm the artifact cache by evaluating every ``EXAMPLE_MODELS`` row.

    Meant to be run in the background next to the Gradio server so that a
    first click on an Examples row does not wait on the HF/MS metadata fetch.

    Best-effort only: an exception from any single example is printed and the
    loop moves on; nothing propagates to the caller. Export
    ``LLM_CAL_PREWARM=0`` to skip the warm-up (handy during local development,
    where one API call per example on every start is unwanted).
    """
    import time

    total = len(EXAMPLE_MODELS)
    print(f"[prewarm] starting cache warm-up for {total} examples")

    for index, example in enumerate(EXAMPLE_MODELS, 1):
        model_id, _vendor, gpu, engine, source = example

        # Map the UI-facing source name onto the evaluator key.
        if "modelscope" in source.lower():
            backend = "modelscope"
        else:
            backend = "huggingface"
        label = f"{index}/{total} {backend}:{model_id}"

        try:
            started = time.monotonic()
            evaluator = _get_evaluator(backend)
            evaluator.evaluate(model_id=model_id, gpu=gpu, engine=engine)
            elapsed = time.monotonic() - started
            print(f"[prewarm] {label} ok ({elapsed:.1f}s)")
        except Exception as e:  # noqa: BLE001
            # Deliberately broad: warm-up must never take the app down.
            print(f"[prewarm] {label} skip — {type(e).__name__}: {e}")

        # Pause between examples to stay well below the anonymous HF/MS
        # rate limits.
        time.sleep(2)

    print("[prewarm] done")
|
| 2369 |
+
|
| 2370 |
+
|
| 2371 |
+
if __name__ == "__main__":
    # Warm-up is on by default; any LLM_CAL_PREWARM value other than "1"
    # turns it off.
    prewarm_enabled = os.environ.get("LLM_CAL_PREWARM", "1") == "1"
    if prewarm_enabled:
        import threading

        # Daemon thread: it must not keep the process alive on shutdown.
        warmup_thread = threading.Thread(target=_prewarm_cache, daemon=True)
        warmup_thread.start()

    app = _build_ui()
    app.launch(theme=THEME, css=CUSTOM_CSS)
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=6.0,<7.0
|
| 2 |
+
llm-cal>=0.1.3
|
src/llm_cal/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""llm-cal — LLM inference hardware calculator."""
|
| 2 |
+
|
| 3 |
+
from llm_cal.core.evaluator import Evaluator
|
| 4 |
+
from llm_cal.output.labels import Label
|
| 5 |
+
|
| 6 |
+
__all__ = ["Evaluator", "Label"]
|
src/llm_cal/architecture/__init__.py
ADDED
|
File without changes
|
src/llm_cal/architecture/detector.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""`detect()` — main orchestration over trait sub-detectors.
|
| 2 |
+
|
| 3 |
+
Step 1: Family dispatch (state_space vs transformer vs unknown).
|
| 4 |
+
Step 2: Gather traits (independent sub-detectors).
|
| 5 |
+
Step 3: Assemble Profile with a confidence level.
|
| 6 |
+
|
| 7 |
+
Fallback path: `_fallback_unknown()` for configs missing key fields. This is
|
| 8 |
+
the bedrock of "works on day-0" — new model types degrade gracefully.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
from typing import Any
|
| 14 |
+
|
| 15 |
+
from llm_cal.architecture.profile import (
|
| 16 |
+
ArchitectureProfile,
|
| 17 |
+
Confidence,
|
| 18 |
+
Family,
|
| 19 |
+
)
|
| 20 |
+
from llm_cal.architecture.traits import (
|
| 21 |
+
detect_attention,
|
| 22 |
+
detect_moe,
|
| 23 |
+
detect_position,
|
| 24 |
+
detect_sliding_window,
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
# `model_type` values that are handled well. Keep this list in step with the
# engine_compat matrix.
KNOWN_MODEL_TYPES: frozenset[str] = frozenset(
    (
        # Llama / Mistral lineage
        "llama",
        "mistral",
        "mixtral",
        # Qwen
        "qwen2",
        "qwen2_moe",
        "qwen3",
        "qwen3_moe",
        # DeepSeek
        "deepseek_v2",
        "deepseek_v3",
        "deepseek_v3_2",
        "deepseek_v4",
        # Gemma
        "gemma",
        "gemma2",
        "gemma3",
        # Phi
        "phi",
        "phi3",
    )
)

# `model_type` values that denote state-space (non-attention) models.
STATE_SPACE_TYPES: frozenset[str] = frozenset(
    ("mamba", "mamba2", "falcon_mamba", "jamba")
)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _coerce_int(value: object) -> int:
    """Convert a raw config value to ``int``, returning 0 when that fails.

    ``None`` (a JSON ``null``) and anything ``int()`` rejects map to 0, so a
    missing or malformed field degrades the profile instead of raising.
    """
    if value is None:
        return 0
    try:
        return int(value)  # type: ignore[call-overload]
    except (TypeError, ValueError, OverflowError):
        return 0


def _normalized_model_type(config: dict[str, Any]) -> str:
    """Return the lower-cased ``model_type``, or ``""`` when absent or null."""
    # `or ""` matters: str(None) would give the truthy string "none".
    return str(config.get("model_type") or "").lower()


def _normalized_architectures(config: dict[str, Any]) -> tuple[str, ...]:
    """Return the lower-cased ``architectures`` entries as a tuple.

    A null/missing field yields ``()``; a bare string is treated as a single
    entry rather than being iterated character by character; any other
    non-list value yields ``()``.
    """
    raw = config.get("architectures") or []
    if isinstance(raw, str):
        raw = [raw]
    elif not isinstance(raw, (list, tuple)):
        raw = []
    return tuple(str(a).lower() for a in raw)


def detect(config: dict[str, Any]) -> ArchitectureProfile:
    """Main entry. Given a parsed config.json dict, return an ArchitectureProfile.

    Args:
        config: Parsed ``config.json`` contents. Fields may be missing, null
            or malformed; such fields are treated as absent rather than
            raising.

    Returns:
        A state-space profile (flagged ``v0_1_unsupported``) when the config
        names a state-space model type or carries ``ssm_cfg``; the
        ``_fallback_unknown`` profile when the model cannot be identified or
        lacks a usable layer count / hidden size; otherwise a transformer
        profile with the detected traits.
    """
    model_type = _normalized_model_type(config)
    architectures = _normalized_architectures(config)

    # Step 1: state_space family short-circuits — v0.1 unsupported, but we identify it
    if model_type in STATE_SPACE_TYPES or "ssm_cfg" in config:
        return ArchitectureProfile(
            model_type=model_type,
            architectures=architectures,
            family=Family.STATE_SPACE,
            num_hidden_layers=_coerce_int(config.get("num_hidden_layers")),
            hidden_size=_coerce_int(config.get("hidden_size")),
            vocab_size=_coerce_int(config.get("vocab_size")),
            confidence=Confidence.HIGH,
            auxiliary={"v0_1_unsupported": True},
        )

    # Step 2: reject if fundamentally unidentifiable
    if not model_type and not architectures:
        return _fallback_unknown(config)

    # Step 3: required fields (null / malformed values count as missing)
    num_layers = _coerce_int(config.get("num_hidden_layers"))
    hidden_size = _coerce_int(config.get("hidden_size"))
    if not num_layers or not hidden_size:
        return _fallback_unknown(config)

    # Step 4: gather traits (each is independent and may return None)
    attention = detect_attention(config)
    moe = detect_moe(config)
    position = detect_position(config)
    sliding = detect_sliding_window(config)

    # Step 5: confidence — HIGH iff model_type is in the registry
    confidence = Confidence.HIGH if model_type in KNOWN_MODEL_TYPES else Confidence.MEDIUM

    # Pass-through of config fields our formulas can use downstream. Keeps the
    # Profile schema stable while enabling richer computation (e.g. dense FFN
    # param count needs intermediate_size).
    auxiliary: dict[str, object] = {}
    if isinstance(config.get("intermediate_size"), int):
        auxiliary["intermediate_size"] = config["intermediate_size"]
    if config.get("tie_word_embeddings") is not None:
        auxiliary["tie_word_embeddings"] = bool(config["tie_word_embeddings"])

    return ArchitectureProfile(
        model_type=model_type,
        architectures=architectures,
        family=Family.TRANSFORMER,
        num_hidden_layers=num_layers,
        hidden_size=hidden_size,
        vocab_size=_coerce_int(config.get("vocab_size")),
        confidence=confidence,
        attention=attention,
        moe=moe,
        position=position,
        sliding_window=sliding,
        auxiliary=auxiliary,
    )


def _fallback_unknown(config: dict[str, Any]) -> ArchitectureProfile:
    """Graceful degradation when config.json is unusable.

    Still returns a valid Profile. Consumers check `family == Family.UNKNOWN`
    or `confidence == Confidence.LOW` and skip KV-cache estimation accordingly.

    This path must never raise on bad input, so every field is read through
    the tolerant helpers: null or malformed numbers become 0, and a null or
    non-list ``architectures`` becomes an empty tuple.
    """
    return ArchitectureProfile(
        model_type=_normalized_model_type(config),
        architectures=_normalized_architectures(config),
        family=Family.UNKNOWN,
        num_hidden_layers=_coerce_int(config.get("num_hidden_layers")),
        hidden_size=_coerce_int(config.get("hidden_size")),
        vocab_size=_coerce_int(config.get("vocab_size")),
        confidence=Confidence.LOW,
        auxiliary={
            "warning": (
                "No recognizable model_type or missing essential config fields. "
                "Weight estimate from safetensors file size only; "
                "KV cache cannot be estimated; engine compatibility unknown."
            )
        },
    )
|
src/llm_cal/architecture/formulas/__init__.py
ADDED
|
File without changes
|
src/llm_cal/architecture/formulas/kv_cache.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""KV cache estimation — traits-composed formula.
|
| 2 |
+
|
| 3 |
+
The formula is NOT owned by a single architecture module. Instead we compose it
|
| 4 |
+
from the traits on `ArchitectureProfile`:
|
| 5 |
+
|
| 6 |
+
baseline = 2 (K+V) * num_kv_heads * head_dim * seq_len * dtype_bytes * num_layers
|
| 7 |
+
|
| 8 |
+
Then apply compositional modifiers:
|
| 9 |
+
* MLA: baseline uses kv_lora_rank instead of num_kv_heads * head_dim
|
| 10 |
+
(DeepSeek's compressed KV representation)
|
| 11 |
+
* CSA_HCA: multiply by an effective-ratio derived from compress_ratios
|
| 12 |
+
(most layers are heavily compressed, a few are dense)
|
| 13 |
+
* Sliding window: cap `seq_len` at the window size
|
| 14 |
+
* NSA: multiply by (nsa_topk / seq_len), clamped — sparse attention
|
| 15 |
+
keeps only top-k keys
|
| 16 |
+
|
| 17 |
+
Returns AnnotatedValue tagged [estimated] unless we can't compute it at all.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
from llm_cal.architecture.profile import (
|
| 23 |
+
ArchitectureProfile,
|
| 24 |
+
AttentionTraits,
|
| 25 |
+
Confidence,
|
| 26 |
+
Family,
|
| 27 |
+
)
|
| 28 |
+
from llm_cal.output.labels import AnnotatedValue, Label
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def compute_kv_cache_bytes(
    profile: ArchitectureProfile,
    seq_len: int,
    dtype_bytes: int = 2,  # BF16/FP16 default
) -> AnnotatedValue[int]:
    """Estimate the KV-cache bytes held by a single request of `seq_len` tokens.

    Args:
        profile: Architecture snapshot; its family, confidence, layer count,
            attention traits and sliding window drive the estimate.
        seq_len: Number of tokens in the request's context.
        dtype_bytes: Bytes per cached element (2 for BF16/FP16).

    Returns:
        An AnnotatedValue labelled ESTIMATED when a number could be produced
        (including 0 for a non-positive `seq_len`), or UNKNOWN with a value of
        0 when the profile cannot support an estimate. `source` carries either
        the reason or the inputs that went into the formula.
    """
    if seq_len <= 0:
        return AnnotatedValue(0, Label.ESTIMATED, source="seq_len <= 0")

    # Every "cannot compute" path yields 0 bytes tagged UNKNOWN; only the
    # explanation differs, so pick the explanation first and return once.
    unknown_reason = None
    if profile.family == Family.STATE_SPACE:
        unknown_reason = "state-space model has no KV cache concept"
    elif profile.family == Family.UNKNOWN or profile.confidence == Confidence.LOW:
        unknown_reason = "unknown architecture — cannot estimate KV cache"
    elif profile.attention is None or profile.num_hidden_layers <= 0:
        unknown_reason = "missing attention traits or layer count"
    if unknown_reason is not None:
        return AnnotatedValue(0, Label.UNKNOWN, source=unknown_reason)

    traits = profile.attention
    layer_count = profile.num_hidden_layers
    window = profile.sliding_window

    # Effective sequence length. The sliding-window cap is applied only to the
    # non-sparse variants: CSA_HCA and NSA already encode their own per-layer
    # reduction, and stacking the cap on top double-counts (the original
    # author measured estimates ~1000x too low on DeepSeek-V4 that way).
    tokens_cached = seq_len
    window_suffix = ""
    sparse = traits.variant in ("CSA_HCA", "NSA")
    if window and window > 0 and not sparse:
        tokens_cached = min(seq_len, window)
        if tokens_cached < seq_len:
            window_suffix = f" (sliding_window={window} caps {seq_len} -> {tokens_cached})"

    # Dense baseline: per-token-per-layer bytes across all tokens and layers.
    dense_bytes = _per_layer_per_token_bytes(traits, dtype_bytes) * tokens_cached * layer_count

    # Sparse modifiers scale the dense baseline; the two variants are mutually
    # exclusive because `variant` holds a single value.
    total_bytes = dense_bytes
    label_text: str = str(traits.variant)
    if traits.variant == "CSA_HCA" and traits.compress_ratios:
        keep = _average_csa_hca_ratio(traits.compress_ratios)
        total_bytes = int(dense_bytes * keep)
        label_text = f"{label_text} (avg compress ratio {keep:.3f})"
    elif traits.variant == "NSA" and traits.nsa_topk and traits.nsa_topk > 0:
        # Only top-k keys are kept; never scale above the dense baseline.
        keep = min(1.0, traits.nsa_topk / tokens_cached)
        total_bytes = int(dense_bytes * keep)
        label_text = f"{label_text} (nsa_topk={traits.nsa_topk}, sparsity={keep:.3f})"

    return AnnotatedValue(
        total_bytes,
        Label.ESTIMATED,
        source=f"{label_text}: 2*kv_shape*{dtype_bytes}B*{tokens_cached}*{layer_count}{window_suffix}",
    )
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def _per_layer_per_token_bytes(attn: AttentionTraits, dtype_bytes: int) -> int:
    """Return the bytes one layer caches for one token, before sparse scaling.

    Args:
        attn: Attention shape (variant, KV head count, head dim, optional
            `kv_lora_rank`).
        dtype_bytes: Bytes per cached element.

    Returns:
        `kv_lora_rank * dtype_bytes` for MLA with a known rank, otherwise
        `2 * num_kv_heads * head_dim * dtype_bytes`.
    """
    uses_latent = attn.variant == "MLA" and bool(attn.kv_lora_rank)
    if not uses_latent:
        # Standard / GQA / MQA / CSA+HCA: K and V are stored separately, hence
        # the factor of 2. Any sparse scaling is applied by the caller.
        return 2 * attn.num_kv_heads * attn.head_dim * dtype_bytes

    # MLA: K and V share one latent vector of size kv_lora_rank, so there is
    # deliberately no factor of 2 here.
    return attn.kv_lora_rank * dtype_bytes
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def _average_csa_hca_ratio(compress_ratios: tuple[int, ...]) -> float:
    """Average the per-layer "keep fraction" encoded by `compress_ratios`.

    DeepSeek V4 `compress_ratios` semantics, one entry per layer:

        0    -> dense attention (keep 100%)
        N>0  -> keep 1/N of tokens

    Example: ratios = [0, 0, 4, 128, 4, 128, ...]
        - two dense layers (fraction = 1.0)
        - remaining layers alternate 1/4 and 1/128
        - the result is the plain mean over all entries

    Args:
        compress_ratios: Per-layer compression ratios.

    Returns:
        The mean keep fraction, always in (0, 1]. An empty input means "no
        compression information" and returns 1.0.

    Entries that cannot describe a compression (negative values, or values
    between 0 and 1) are counted as dense. Dividing by them would produce a
    negative or >1 fraction and, downstream, a negative or inflated byte
    count; treating them as dense errs on the side of over-estimating memory.
    """
    if not compress_ratios:
        return 1.0
    # r <= 1 covers the documented dense marker (0), the no-op ratio (1) and
    # every malformed value; only r > 1 actually compresses.
    keep_fractions = [1.0 / r if r > 1 else 1.0 for r in compress_ratios]
    return sum(keep_fractions) / len(keep_fractions)
|
src/llm_cal/architecture/formulas/weight.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Weight count estimation — total parameters and total bytes by assumption.
|
| 2 |
+
|
| 3 |
+
Two distinct purposes, kept separate by label:
|
| 4 |
+
* estimate_total_params(profile) -> [estimated] param count
|
| 5 |
+
* predicted_bytes_under_quant(params, scheme) -> [estimated] bytes
|
| 6 |
+
|
| 7 |
+
The weight_analyzer/reconciler compares predicted_bytes against observed file
|
| 8 |
+
sizes to identify the actual quantization scheme. That's the DeepSeek-V4 story.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
from llm_cal.architecture.profile import ArchitectureProfile
|
| 14 |
+
from llm_cal.output.labels import AnnotatedValue, Label
|
| 15 |
+
from llm_cal.weight_analyzer import _QUANT_BPP, QuantizationScheme
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def estimate_total_params(profile: ArchitectureProfile) -> AnnotatedValue[int]:
    """Roughly estimate the total parameter count described by `profile`.

    Components summed:
        - Embedding: vocab_size * hidden_size (+ an output head of the same
          size unless `tie_word_embeddings` is set in `profile.auxiliary`)
        - Per-layer attention projections (variant-dependent)
        - Per-layer FFN (dense or MoE)
        - Per-layer norms: 2 * hidden_size

    Args:
        profile: Architecture snapshot providing the shape values.

    Returns:
        An AnnotatedValue labelled ESTIMATED holding the arithmetic sum, with
        `source` spelling out the inputs. When the layer count or hidden size
        is non-positive the result is 0 labelled UNKNOWN.
    """
    if profile.num_hidden_layers <= 0 or profile.hidden_size <= 0:
        return AnnotatedValue(0, Label.UNKNOWN, source="insufficient shape info in profile")

    hidden = profile.hidden_size
    n_layers = profile.num_hidden_layers
    vocab = profile.vocab_size

    # Embedding + output head. With tied weights the output head IS the
    # embedding, so it must not be counted a second time.
    embed_params = vocab * hidden
    tied = bool(profile.auxiliary.get("tie_word_embeddings", False))
    output_head_params = 0 if tied else embed_params

    attn_params = _attention_params(profile)
    ffn_params = _ffn_params(profile)
    # Two norms per layer, one scale value per hidden feature.
    norm_params = 2 * hidden

    per_layer = attn_params + ffn_params + norm_params
    total = embed_params + output_head_params + per_layer * n_layers

    # Keep the provenance string in step with the arithmetic: the factor is 2
    # only when a separate output head was actually added above.
    embed_note = "1 (tied embed/head)" if tied else "2 (embed+head)"
    return AnnotatedValue(
        total,
        Label.ESTIMATED,
        source=(
            f"{vocab} vocab * {hidden} hidden * {embed_note} + "
            f"{n_layers} layers * ({attn_params:,} attn + {ffn_params:,} ffn + norms)"
        ),
    )
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def _attention_params(profile: ArchitectureProfile) -> int:
    """Count the attention projection weights (Q/K/V/O) of a single layer.

    Returns 0 when the profile carries no attention traits. MLA profiles with
    a known `q_lora_rank` use a low-rank approximation; every other case uses
    the standard Q + K + V + O projection shapes.
    """
    traits = profile.attention
    if traits is None:
        return 0

    width = profile.hidden_size
    all_heads = traits.num_heads * traits.head_dim

    if traits.variant == "MLA" and traits.q_lora_rank:
        rank_q = traits.q_lora_rank
        # Without a KV rank, approximate it with the Q rank.
        rank_kv = traits.kv_lora_rank or rank_q
        # NOTE(review): the KV down-projection is described as shared yet is
        # counted twice, while the K+V up-projection is counted once — confirm
        # the intended factors. The arithmetic is kept as-is here.
        low_rank_terms = (
            width * rank_q,  # Q down
            rank_q * all_heads,  # Q up
            width * rank_kv * 2,  # K+V down
            rank_kv * all_heads,  # K+V up
            all_heads * rank_q,  # O down (q rank reused as an O-rank approximation)
            rank_q * width,  # O up
        )
        return sum(low_rank_terms)

    # Standard/GQA/MQA: Q and O are hidden x (heads * head_dim); K and V are
    # hidden x (kv_heads * head_dim).
    kv_width = traits.num_kv_heads * traits.head_dim
    return 2 * width * all_heads + 2 * width * kv_width
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def _ffn_params(profile: ArchitectureProfile) -> int:
    """Count the feed-forward weights (dense or MoE) of a single layer.

    For MoE every expert — routed and shared — is counted, because all of
    them occupy memory; parameters active per token are a different metric
    and are not computed here.
    """
    width = profile.hidden_size
    experts = profile.moe

    if experts is None:
        # Dense path: use intermediate_size from `auxiliary` when it is a
        # positive int, otherwise fall back to 4 * hidden.
        inner = profile.auxiliary.get("intermediate_size")
        if isinstance(inner, int) and inner > 0:
            return 3 * width * inner  # SwiGLU: gate, up, down
        return 3 * width * (4 * width)

    # SwiGLU-style expert: three matrices, each hidden x moe_intermediate.
    per_expert = 3 * width * experts.moe_intermediate_size
    expert_count = experts.num_routed_experts + experts.num_shared_experts
    # Router: one hidden-sized row per routed expert.
    gate_weights = width * experts.num_routed_experts
    return per_expert * expert_count + gate_weights
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def predicted_bytes_under_quant(
    total_params: int, scheme: QuantizationScheme
) -> AnnotatedValue[int]:
    """Predict how many bytes `total_params` occupy under quantization `scheme`.

    Args:
        total_params: Parameter count to convert.
        scheme: Quantization scheme, looked up in `_QUANT_BPP`.

    Returns:
        An AnnotatedValue labelled ESTIMATED with `int(total_params * bpp)`,
        or 0 labelled UNKNOWN when the scheme has no (or a zero)
        bytes-per-param entry.
    """
    bytes_per_param = _QUANT_BPP.get(scheme, 0.0)
    if bytes_per_param != 0.0:
        return AnnotatedValue(
            int(total_params * bytes_per_param),
            Label.ESTIMATED,
            source=f"{total_params:,} params * {bytes_per_param} bytes/param ({scheme})",
        )
    # A missing entry and an explicit 0.0 are both treated as "no mapping".
    return AnnotatedValue(
        0,
        Label.UNKNOWN,
        source=f"no bytes-per-param mapping for {scheme}",
    )
|
src/llm_cal/architecture/profile.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""ArchitectureProfile — the core data class the whole tool orbits.
|
| 2 |
+
|
| 3 |
+
Key insight: an architecture is NOT a single label. It's a combination of independent
|
| 4 |
+
traits that co-exist on a Profile. DeepSeek-V3.2 = MoE + MLA + NSA — three traits.
|
| 5 |
+
Single-module dispatch cannot express this; traits composition can.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from dataclasses import dataclass, field
|
| 11 |
+
from enum import StrEnum
|
| 12 |
+
from typing import Literal
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class Family(StrEnum):
|
| 16 |
+
TRANSFORMER = "transformer"
|
| 17 |
+
STATE_SPACE = "state_space" # Mamba, etc. — v0.1 unsupported
|
| 18 |
+
UNKNOWN = "unknown"
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class Confidence(StrEnum):
|
| 22 |
+
HIGH = "high" # model_type in KNOWN_MODEL_TYPES, all fields present
|
| 23 |
+
MEDIUM = "medium" # model_type unknown but architectures[] or config partial
|
| 24 |
+
LOW = "low" # fallback path, config.json missing or malformed
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
AttentionVariant = Literal["MHA", "GQA", "MQA", "MLA", "NSA", "CSA_HCA"]
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass(frozen=True)
class AttentionTraits:
    """Shape of the attention layers, as populated by `detect_attention()`.

    The first four fields are always required; the remaining ones are
    variant-specific and default to None when they do not apply.
    """

    variant: AttentionVariant
    num_heads: int
    num_kv_heads: int
    head_dim: int

    # MLA low-rank projection ranks (DeepSeek V2+).
    q_lora_rank: int | None = None
    kv_lora_rank: int | None = None

    # CSA+HCA sparse attention (DeepSeek V4): compression ratios.
    compress_ratios: tuple[int, ...] | None = None

    # NSA sparse attention (DeepSeek V3.2): top-k setting.
    nsa_topk: int | None = None
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
@dataclass(frozen=True)
class MoETraits:
    """Mixture-of-experts layer shape; a profile with `moe=None` is dense."""

    # Experts selected through the router.
    num_routed_experts: int
    # Experts counted in addition to the routed ones.
    num_shared_experts: int
    num_experts_per_tok: int
    # Inner (intermediate) width of one expert's FFN.
    moe_intermediate_size: int
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
@dataclass(frozen=True)
class PositionTraits:
    """Positional-encoding settings: RoPE / YaRN / AliBi / none."""

    # Encoding flavour; plain RoPE unless stated otherwise.
    rope_type: Literal["rope", "yarn", "alibi", "none"] = "rope"
    # The remaining values stay None when not provided.
    rope_theta: float | None = None
    rope_scaling_factor: float | None = None
    max_position_embeddings: int | None = None
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@dataclass(frozen=True)
|
| 68 |
+
class ArchitectureProfile:
|
| 69 |
+
"""Complete architectural snapshot of a model.
|
| 70 |
+
|
| 71 |
+
This drives weight/KV-cache formulas, engine matching, and fleet planning.
|
| 72 |
+
"""
|
| 73 |
+
|
| 74 |
+
model_type: str # config.json's `model_type` (lowercase)
|
| 75 |
+
architectures: tuple[str, ...] # config.json's `architectures[]`
|
| 76 |
+
family: Family
|
| 77 |
+
num_hidden_layers: int
|
| 78 |
+
hidden_size: int
|
| 79 |
+
vocab_size: int
|
| 80 |
+
confidence: Confidence
|
| 81 |
+
# Traits (composable — not all populated)
|
| 82 |
+
attention: AttentionTraits | None = None
|
| 83 |
+
moe: MoETraits | None = None
|
| 84 |
+
position: PositionTraits | None = None
|
| 85 |
+
sliding_window: int | None = None # None = no window
|
| 86 |
+
# Pass-through for traits we haven't categorised yet
|
| 87 |
+
auxiliary: dict[str, object] = field(default_factory=dict)
|
| 88 |
+
|
| 89 |
+
@property
|
| 90 |
+
def is_moe(self) -> bool:
|
| 91 |
+
return self.moe is not None
|
| 92 |
+
|
| 93 |
+
@property
|
| 94 |
+
def is_sparse_attention(self) -> bool:
|
| 95 |
+
if self.attention is None:
|
| 96 |
+
return False
|
| 97 |
+
return self.attention.variant in ("NSA", "CSA_HCA")
|
src/llm_cal/architecture/traits.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Independent trait sub-detectors.
|
| 2 |
+
|
| 3 |
+
Each function inspects config.json and returns a trait dataclass (or None).
|
| 4 |
+
They co-exist: a MoE+MLA+CSA_HCA model matches all three.
|
| 5 |
+
|
| 6 |
+
Dispatch order inside `detect_attention()` is critical because some keys are
|
| 7 |
+
ambiguous (e.g. num_kv_heads < num_heads can be GQA OR a side-effect of MLA
|
| 8 |
+
where there's a single compressed KV head).
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
from typing import Any
|
| 14 |
+
|
| 15 |
+
from llm_cal.architecture.profile import (
|
| 16 |
+
AttentionTraits,
|
| 17 |
+
MoETraits,
|
| 18 |
+
PositionTraits,
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def detect_moe(config: dict[str, Any]) -> MoETraits | None:
    """Detect mixture-of-experts traits in a parsed config.json.

    The presence of any routed-expert key (`n_routed_experts`,
    `num_local_experts`, `num_experts`) with a truthy value signals MoE.

    Args:
        config: Parsed config.json contents.

    Returns:
        MoETraits when a routed-expert count is found, otherwise None (dense).
        Missing, null or zero secondary values fall back to their defaults:
        0 shared experts, 1 expert per token, 0 intermediate size.
    """
    routed = (
        config.get("n_routed_experts")
        or config.get("num_local_experts")
        or config.get("num_experts")
    )
    if not routed:
        return None

    # Use `or` rather than a `.get()` default: a key that is present with a
    # JSON null would otherwise reach int(None) and raise TypeError.
    shared = config.get("n_shared_experts") or 0
    per_token = config.get("num_experts_per_tok") or config.get("num_experts_per_token") or 1
    intermediate = config.get("moe_intermediate_size") or config.get("intermediate_size") or 0

    return MoETraits(
        num_routed_experts=int(routed),
        num_shared_experts=int(shared),
        num_experts_per_tok=int(per_token),
        moe_intermediate_size=int(intermediate),
    )
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def detect_attention(config: dict[str, Any]) -> AttentionTraits:
    """Detect the attention variant and shape from a parsed config.json.

    Detection is order-sensitive; the first match decides the variant, while
    the shape fields are populated for every variant:
        1. CSA+HCA: `compress_ratios` list whose length matches the layer count
        2. NSA:     `nsa_config` / `sparse_attention_cfg` present
        3. MLA:     `q_lora_rank` OR `kv_lora_rank` present
        4. GQA/MQA: num_kv_heads < num_heads
        5. MHA:     default

    Args:
        config: Parsed config.json contents.

    Returns:
        AttentionTraits for the detected variant. Missing or null numeric keys
        fall back to defaults instead of raising: 1 head, KV heads equal to
        heads, head_dim derived from hidden_size (at least 1), 0 layers.
    """
    # `or` fallbacks make an explicit JSON null (or a zero head count, which
    # would otherwise divide by zero below) behave like a missing key.
    num_heads = int(config.get("num_attention_heads") or 1)
    raw_kv_heads = config.get("num_key_value_heads")
    num_kv_heads = num_heads if raw_kv_heads is None else int(raw_kv_heads)
    hidden_size = int(config.get("hidden_size") or 0)
    head_dim = int(config.get("head_dim") or (hidden_size // num_heads or 1))
    num_layers = int(config.get("num_hidden_layers") or 0)

    q_lora = config.get("q_lora_rank")
    kv_lora = config.get("kv_lora_rank")
    q_lora_rank = int(q_lora) if q_lora else None
    kv_lora_rank = int(kv_lora) if kv_lora else None
    compress_ratios = config.get("compress_ratios")
    has_nsa = "nsa_config" in config or "sparse_attention_cfg" in config

    # CSA+HCA: the length check guards against future variants that reuse the
    # same key name with different semantics. Accepted lengths:
    #   - num_hidden_layers
    #   - num_hidden_layers + num_nextn_predict_layers (DeepSeek MTP: one extra
    #     ratio for the next-token prediction head)
    # With no known layer count the set is empty and nothing can match.
    nextn = int(config.get("num_nextn_predict_layers") or 0)
    accepted_lengths = {num_layers, num_layers + nextn} if num_layers > 0 else set()
    if isinstance(compress_ratios, list) and len(compress_ratios) in accepted_lengths:
        return AttentionTraits(
            variant="CSA_HCA",
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_dim=head_dim,
            q_lora_rank=q_lora_rank,
            kv_lora_rank=kv_lora_rank,
            compress_ratios=tuple(compress_ratios),
        )

    if has_nsa:
        nsa_cfg = config.get("nsa_config") or config.get("sparse_attention_cfg", {})
        nsa_topk = None
        if isinstance(nsa_cfg, dict):
            nsa_topk = nsa_cfg.get("topk") or nsa_cfg.get("index_topk")
        return AttentionTraits(
            variant="NSA",
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_dim=head_dim,
            # Keep the LoRA ranks, as the CSA_HCA branch does, so the shape
            # fields stay populated whichever variant wins.
            q_lora_rank=q_lora_rank,
            kv_lora_rank=kv_lora_rank,
            nsa_topk=int(nsa_topk) if nsa_topk else None,
        )

    if q_lora or kv_lora:
        return AttentionTraits(
            variant="MLA",
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_dim=head_dim,
            q_lora_rank=q_lora_rank,
            kv_lora_rank=kv_lora_rank,
        )

    if num_kv_heads < num_heads:
        # A single shared KV head is MQA; any other reduced count is GQA.
        variant = "MQA" if num_kv_heads == 1 else "GQA"
        return AttentionTraits(
            variant=variant,  # type: ignore[arg-type]
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_dim=head_dim,
        )

    return AttentionTraits(
        variant="MHA",
        num_heads=num_heads,
        num_kv_heads=num_kv_heads,
        head_dim=head_dim,
    )
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def detect_position(config: dict[str, Any]) -> PositionTraits:
    """Build positional-encoding traits from a parsed config.json.

    The encoding type is read from `rope_scaling["type"]`, then
    `rope_scaling["rope_type"]`, lower-cased; anything other than "rope",
    "yarn", "alibi" or "none" is reported as plain "rope". Numeric fields
    become None when the corresponding key is missing or falsy.
    """
    scaling = config.get("rope_scaling") or {}
    declared = scaling.get("type") or scaling.get("rope_type") or "rope"
    kind = declared.lower()
    if kind not in ("rope", "yarn", "alibi", "none"):
        kind = "rope"

    theta = config.get("rope_theta")
    factor = scaling.get("factor")
    max_positions = config.get("max_position_embeddings")

    return PositionTraits(
        rope_type=kind,  # type: ignore[arg-type]
        rope_theta=float(theta) if theta else None,
        rope_scaling_factor=float(factor) if factor else None,
        max_position_embeddings=int(max_positions) if max_positions else None,
    )
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def detect_sliding_window(config: dict[str, Any]) -> int | None:
    """Return the window size if sliding-window attention is used, else None.

    Args:
        config: Parsed config.json contents.

    Returns:
        The positive `sliding_window` value as an int, or None when the key is
        missing/null, the window is not positive, or the config explicitly
        sets `use_sliding_window` to false.
    """
    # Some configs ship a `sliding_window` size together with
    # `use_sliding_window: false`; the size is then inert and must not be
    # reported as an active window.
    if config.get("use_sliding_window") is False:
        return None

    sw = config.get("sliding_window")
    if sw is None:
        return None
    window = int(sw)
    # Zero or negative sizes cannot describe a window.
    return window if window > 0 else None
|
src/llm_cal/benchmark/__init__.py
ADDED
|
File without changes
|
src/llm_cal/benchmark/dataset.yaml
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Reference benchmark dataset — curated anchor points for validating llm-cal
|
| 2 |
+
# output against publicly-known values.
|
| 3 |
+
#
|
| 4 |
+
# This is NOT a synthetic benchmark. Each entry cites where the expected
|
| 5 |
+
# values came from — HF API, model card text, vLLM/SGLang recipe, or
|
| 6 |
+
# hand computation in the design doc. If you add an entry, cite sources.
|
| 7 |
+
#
|
| 8 |
+
# The runner (`llm-cal benchmark`) fetches each model's live config and
|
| 9 |
+
# compares the tool's output against these expectations. Failures mean
|
| 10 |
+
# either the tool drifted or the reference data is stale.
|
| 11 |
+
schema_version: 1
|
| 12 |
+
entries:
|
| 13 |
+
# ------------------------------------------------------------
|
| 14 |
+
# Signature case — DeepSeek-V4-Flash. Every claim here is the
|
| 15 |
+
# reason this tool exists (vs gpu_poor's naive FP8 assumption).
|
| 16 |
+
# ------------------------------------------------------------
|
| 17 |
+
- name: "DeepSeek-V4-Flash on 8x H800 (tool's reference case)"
|
| 18 |
+
model_id: deepseek-ai/DeepSeek-V4-Flash
|
| 19 |
+
gpu: H800
|
| 20 |
+
engine: vllm
|
| 21 |
+
expectations:
|
| 22 |
+
- field: attention_variant
|
| 23 |
+
expected: CSA_HCA
|
| 24 |
+
source: "config.json compress_ratios length=44 matches n_layers+n_mtp"
|
| 25 |
+
- field: quantization
|
| 26 |
+
expected: FP4_FP8_MIXED
|
| 27 |
+
source: "HF model card: 'FP4 + FP8 Mixed: MoE experts FP4, others FP8'"
|
| 28 |
+
- field: weight_bytes
|
| 29 |
+
expected_min: 158_000_000_000
|
| 30 |
+
expected_max: 162_000_000_000
|
| 31 |
+
source: "HF siblings API (46x ~3.57 GB safetensors shards ≈ 160 GB)"
|
| 32 |
+
- field: fleet_prod_gpus
|
| 33 |
+
expected: 8
|
| 34 |
+
source: "Design doc hand computation: 8x H800 for prod-scale concurrency"
|
| 35 |
+
- field: is_moe
|
| 36 |
+
expected: true
|
| 37 |
+
source: "config.json n_routed_experts=256"
|
| 38 |
+
|
| 39 |
+
# ------------------------------------------------------------
|
| 40 |
+
# Dense GQA — Qwen2.5-72B. Validates:
|
| 41 |
+
# - dense (no MoE) detection
|
| 42 |
+
# - BF16/FP16 quantization path
|
| 43 |
+
# - GQA KV sharding math (critical for Llama-family models)
|
| 44 |
+
# ------------------------------------------------------------
|
| 45 |
+
- name: "Qwen2.5-72B on 8x H100 (GQA reference)"
|
| 46 |
+
model_id: Qwen/Qwen2.5-72B-Instruct
|
| 47 |
+
gpu: H100
|
| 48 |
+
engine: vllm
|
| 49 |
+
expectations:
|
| 50 |
+
- field: attention_variant
|
| 51 |
+
expected: GQA
|
| 52 |
+
source: "config.json num_kv_heads=8 < num_attention_heads=64"
|
| 53 |
+
- field: quantization
|
| 54 |
+
expected: FP16
|
| 55 |
+
source: "config.json torch_dtype=bfloat16, no quantization_config"
|
| 56 |
+
- field: weight_bytes
|
| 57 |
+
expected_min: 140_000_000_000
|
| 58 |
+
expected_max: 150_000_000_000
|
| 59 |
+
source: "HF siblings API — 72.7B params × 2 bytes ≈ 145 GB"
|
| 60 |
+
- field: is_moe
|
| 61 |
+
expected: false
|
| 62 |
+
source: "config.json has no n_routed_experts / num_local_experts"
|
| 63 |
+
- field: fleet_prod_gpus_at_most
|
| 64 |
+
expected: 8
|
| 65 |
+
source: "Weights fit on 8x H100 (145 GB / 8 ≈ 18 GB per GPU)"
|
| 66 |
+
|
| 67 |
+
# ------------------------------------------------------------
|
| 68 |
+
# DeepSeek-V3 (classic MoE + MLA, not V3.2's NSA) — validates MLA detection
|
| 69 |
+
# ------------------------------------------------------------
|
| 70 |
+
- name: "DeepSeek-V3 on H800 (MoE+MLA, no sparse attention)"
|
| 71 |
+
model_id: deepseek-ai/DeepSeek-V3
|
| 72 |
+
gpu: H800
|
| 73 |
+
engine: vllm
|
| 74 |
+
expectations:
|
| 75 |
+
- field: attention_variant
|
| 76 |
+
expected: MLA
|
| 77 |
+
source: "config.json q_lora_rank=1536, no compress_ratios or nsa_config"
|
| 78 |
+
- field: is_moe
|
| 79 |
+
expected: true
|
| 80 |
+
source: "config.json n_routed_experts=256"
|
| 81 |
+
- field: quantization
|
| 82 |
+
expected: FP8
|
| 83 |
+
source: "config.json quantization_config.quant_method=fp8"
|
| 84 |
+
- field: weight_bytes
|
| 85 |
+
expected_min: 680_000_000_000
|
| 86 |
+
expected_max: 700_000_000_000
|
| 87 |
+
source: "HF siblings API — 685B params (671B main model + 14B MTP module) × 1 byte (FP8) ≈ 685 GB"
|
| 88 |
+
|
| 89 |
+
# ------------------------------------------------------------
|
| 90 |
+
# Mixtral 8x7B — dense-MoE variant, non-MLA
|
| 91 |
+
# ------------------------------------------------------------
|
| 92 |
+
- name: "Mixtral 8x7B on 4x H100 (standard MoE, no MLA)"
|
| 93 |
+
model_id: mistralai/Mixtral-8x7B-v0.1
|
| 94 |
+
gpu: H100
|
| 95 |
+
engine: vllm
|
| 96 |
+
expectations:
|
| 97 |
+
- field: attention_variant
|
| 98 |
+
expected: GQA
|
| 99 |
+
source: "config.json num_kv_heads=8 < num_attention_heads=32"
|
| 100 |
+
- field: is_moe
|
| 101 |
+
expected: true
|
| 102 |
+
source: "config.json num_local_experts=8"
|
| 103 |
+
- field: quantization
|
| 104 |
+
expected: FP16
|
| 105 |
+
source: "config.json torch_dtype=bfloat16, no quantization_config"
|
| 106 |
+
- field: weight_bytes
|
| 107 |
+
expected_min: 90_000_000_000
|
| 108 |
+
expected_max: 100_000_000_000
|
| 109 |
+
source: "HF siblings API — 46.7B total params × 2 bytes ≈ 93 GB"
|
| 110 |
+
|
| 111 |
+
# ------------------------------------------------------------
|
| 112 |
+
# DeepSeek-V3.2 — MLA structurally (NSA at runtime). Validates:
|
| 113 |
+
# - model_type=deepseek_v32 is recognized
|
| 114 |
+
# - FP8 quantization (inherited from V3)
|
| 115 |
+
# - Tool honestly reports MLA because config.json exposes only MLA
|
| 116 |
+
# keys; runtime NSA behavior is NOT in config. Future detection
|
| 117 |
+
# improvement could override based on model_type.
|
| 118 |
+
# ------------------------------------------------------------
|
| 119 |
+
- name: "DeepSeek-V3.2 on H800 (MLA config; NSA runtime)"
|
| 120 |
+
model_id: deepseek-ai/DeepSeek-V3.2
|
| 121 |
+
gpu: H800
|
| 122 |
+
engine: vllm
|
| 123 |
+
expectations:
|
| 124 |
+
- field: attention_variant
|
| 125 |
+
expected: MLA
|
| 126 |
+
source: >-
|
| 127 |
+
config.json q_lora_rank=1536, no nsa_config key — detector
|
| 128 |
+
correctly reports MLA. V3.2's NSA sparse behavior is a runtime
|
| 129 |
+
feature selected by vllm --attention-backend nsa, NOT encoded
|
| 130 |
+
in config.json keys. TODO: detector could upgrade to NSA when
|
| 131 |
+
model_type matches known NSA models.
|
| 132 |
+
- field: is_moe
|
| 133 |
+
expected: true
|
| 134 |
+
source: "config.json n_routed_experts=256"
|
| 135 |
+
- field: quantization
|
| 136 |
+
expected: FP8
|
| 137 |
+
source: "config.json quantization_config.quant_method=fp8"
|
| 138 |
+
|
| 139 |
+
# ------------------------------------------------------------
|
| 140 |
+
# Qwen3-30B-A3B — validates qwen3_moe model_type + GQA+MoE combo
|
| 141 |
+
# ------------------------------------------------------------
|
| 142 |
+
- name: "Qwen3-30B-A3B on H100 (Qwen3 MoE, GQA)"
|
| 143 |
+
model_id: Qwen/Qwen3-30B-A3B
|
| 144 |
+
gpu: H100
|
| 145 |
+
engine: vllm
|
| 146 |
+
expectations:
|
| 147 |
+
- field: attention_variant
|
| 148 |
+
expected: GQA
|
| 149 |
+
source: "config.json num_kv_heads=4 < num_attention_heads=32"
|
| 150 |
+
- field: is_moe
|
| 151 |
+
expected: true
|
| 152 |
+
source: "config.json num_experts=128"
|
| 153 |
+
- field: quantization
|
| 154 |
+
expected: FP16
|
| 155 |
+
source: "config.json torch_dtype=bfloat16"
|
| 156 |
+
- field: weight_bytes
|
| 157 |
+
expected_min: 58_000_000_000
|
| 158 |
+
expected_max: 65_000_000_000
|
| 159 |
+
source: "HF siblings API — 30.5B total params × 2 bytes ≈ 61 GB"
|
| 160 |
+
|
| 161 |
+
# ------------------------------------------------------------
|
| 162 |
+
# Qwen2.5-7B — small-model sanity + qwen2 model_type
|
| 163 |
+
# ------------------------------------------------------------
|
| 164 |
+
- name: "Qwen2.5-7B on H100 (small dense, sanity)"
|
| 165 |
+
model_id: Qwen/Qwen2.5-7B-Instruct
|
| 166 |
+
gpu: H100
|
| 167 |
+
engine: vllm
|
| 168 |
+
expectations:
|
| 169 |
+
- field: attention_variant
|
| 170 |
+
expected: GQA
|
| 171 |
+
source: "config.json num_kv_heads=4 < num_attention_heads=28"
|
| 172 |
+
- field: is_moe
|
| 173 |
+
expected: false
|
| 174 |
+
source: "config.json has no MoE keys"
|
| 175 |
+
- field: quantization
|
| 176 |
+
expected: FP16
|
| 177 |
+
source: "config.json torch_dtype=bfloat16"
|
| 178 |
+
- field: weight_bytes
|
| 179 |
+
expected_min: 14_000_000_000
|
| 180 |
+
expected_max: 16_000_000_000
|
| 181 |
+
source: "HF siblings API — 7.6B params × 2 bytes ≈ 15.2 GB"
|
| 182 |
+
|
| 183 |
+
# ------------------------------------------------------------
|
| 184 |
+
# Phi-4 — validates phi3 model_type + dense 14B
|
| 185 |
+
# ------------------------------------------------------------
|
| 186 |
+
- name: "Phi-4 on L40S (phi3 architecture, 14B dense)"
|
| 187 |
+
model_id: microsoft/Phi-4
|
| 188 |
+
gpu: L40S
|
| 189 |
+
engine: vllm
|
| 190 |
+
expectations:
|
| 191 |
+
- field: attention_variant
|
| 192 |
+
expected: GQA
|
| 193 |
+
source: "config.json num_kv_heads=10 < num_attention_heads=40"
|
| 194 |
+
- field: is_moe
|
| 195 |
+
expected: false
|
| 196 |
+
source: "config.json has no MoE keys"
|
| 197 |
+
- field: quantization
|
| 198 |
+
expected: FP16
|
| 199 |
+
source: "config.json torch_dtype=bfloat16"
|
| 200 |
+
- field: weight_bytes
|
| 201 |
+
expected_min: 28_000_000_000
|
| 202 |
+
expected_max: 31_000_000_000
|
| 203 |
+
source: "HF siblings API — 14.7B params × 2 bytes ≈ 29.3 GB"
|
src/llm_cal/benchmark/runner.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Benchmark runner — validate llm-cal's output against curated references.
|
| 2 |
+
|
| 3 |
+
For each entry in dataset.yaml, run the evaluator against the model, then
|
| 4 |
+
compare each `expectations[]` field with the predicted value. Report a
|
| 5 |
+
table of pass/fail per check, plus a summary.
|
| 6 |
+
|
| 7 |
+
This is NOT a synthetic benchmark. Every expected value cites a source
|
| 8 |
+
(HF API, model card text, vLLM recipe, hand computation) so users can
|
| 9 |
+
audit.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
from dataclasses import dataclass
|
| 15 |
+
from functools import lru_cache
|
| 16 |
+
from importlib.resources import files
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
from typing import Literal
|
| 19 |
+
|
| 20 |
+
from pydantic import BaseModel, Field
|
| 21 |
+
from rich.console import Console
|
| 22 |
+
from rich.table import Table
|
| 23 |
+
|
| 24 |
+
from llm_cal.common.yaml_loader import load_yaml
|
| 25 |
+
from llm_cal.core.evaluator import EvaluationReport, Evaluator
|
| 26 |
+
|
| 27 |
+
Status = Literal["PASS", "FAIL", "SKIP"]
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class Expectation(BaseModel):
    """One reference check for a benchmark entry, together with its provenance."""

    # Name of the check; `_evaluate_field` dispatches on this string.
    field: str
    # `expected` is compared directly by the exact-match checks and is the
    # ceiling for `fleet_prod_gpus_at_most`; `expected_min` / `expected_max`
    # are the inclusive bounds of the `weight_bytes` range check.
    expected: str | int | bool | None = None
    expected_min: int | None = None
    expected_max: int | None = None
    # Where the reference value came from; carried into every CheckResult.
    source: str
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class BenchmarkEntry(BaseModel):
    """A model / GPU / engine combination and the expectations checked against it."""

    name: str  # label used to group rows in the results table
    model_id: str  # passed to `Evaluator.evaluate(model_id=...)`
    gpu: str  # passed to `Evaluator.evaluate(gpu=...)`
    engine: str = "vllm"  # passed to `Evaluator.evaluate(engine=...)`
    expectations: list[Expectation] = Field(default_factory=list)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class BenchmarkDataset(BaseModel):
    """Top-level schema of the benchmark dataset file loaded by `load_dataset`."""

    schema_version: int
    entries: list[BenchmarkEntry]
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
@dataclass(frozen=True)
class CheckResult:
    """Immutable outcome of comparing one expectation against an evaluation report."""

    entry_name: str  # name of the benchmark entry this check belongs to
    field: str  # which check was run
    status: Status  # "PASS", "FAIL" or "SKIP"
    predicted: str  # value derived from the report, rendered as text
    expected: str  # reference value rendered as text (see `_fmt_expected`)
    source: str  # provenance of the reference value
    note: str | None = None  # `run_all` stores "ExcType: message" here when evaluation raised
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def _default_dataset_path() -> Path:
    """Return the path of the `dataset.yaml` resource in the `llm_cal.benchmark` package."""
    bundled = files("llm_cal.benchmark").joinpath("dataset.yaml")
    return Path(str(bundled))
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@lru_cache(maxsize=1)
def load_dataset(path: Path | None = None) -> BenchmarkDataset:
    """Load and validate the benchmark dataset.

    Args:
        path: YAML file to read; when omitted the dataset bundled with the
            package is used.

    Returns:
        The parsed `BenchmarkDataset`. The most recent call is memoised by
        `lru_cache(maxsize=1)`.
    """
    target = path if path else _default_dataset_path()
    return load_yaml(target, BenchmarkDataset)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def run_all(
    evaluator: Evaluator | None = None,
    dataset: BenchmarkDataset | None = None,
) -> list[CheckResult]:
    """Evaluate every dataset entry and return one `CheckResult` per expectation.

    Args:
        evaluator: Evaluator to use; a default `Evaluator()` is built if omitted.
        dataset: Dataset to run; `load_dataset()` is used if omitted.

    Returns:
        A flat list of results in dataset order. When evaluating an entry
        raises, each of its expectations is recorded as SKIP and the exception
        type and message are stored in `note`.
    """
    if not evaluator:
        evaluator = Evaluator()
    if not dataset:
        dataset = load_dataset()

    collected: list[CheckResult] = []
    for item in dataset.entries:
        try:
            report = evaluator.evaluate(
                model_id=item.model_id,
                gpu=item.gpu,
                engine=item.engine,
            )
        except Exception as err:
            # Best-effort by design: one entry that cannot be evaluated must
            # not abort the whole benchmark run.
            failure_note = f"{type(err).__name__}: {err}"
            collected.extend(
                CheckResult(
                    entry_name=item.name,
                    field=check.field,
                    status="SKIP",
                    predicted="(evaluation failed)",
                    expected=_fmt_expected(check),
                    source=check.source,
                    note=failure_note,
                )
                for check in item.expectations
            )
        else:
            collected.extend(
                _check_one(item.name, report, check) for check in item.expectations
            )
    return collected
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def _check_one(entry_name: str, report: EvaluationReport, exp: Expectation) -> CheckResult:
    """Evaluate a single expectation against `report` and wrap it as a `CheckResult`."""
    predicted, verdict = _evaluate_field(report, exp)
    return CheckResult(
        entry_name=entry_name,
        field=exp.field,
        status=verdict,
        predicted=predicted,
        expected=_fmt_expected(exp),
        source=exp.source,
    )
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def _prod_fleet_option(report: EvaluationReport) -> tuple[object | None, str | None]:
    """Return `(prod_option, None)`, or `(None, reason)` when no prod tier exists."""
    if report.fleet is None:
        return None, "(no fleet)"
    prod = next((o for o in report.fleet.options if o.tier == "prod"), None)
    if prod is None:
        return None, "(no prod tier)"
    return prod, None


def _evaluate_field(report: EvaluationReport, exp: Expectation) -> tuple[str, Status]:
    """Return (predicted_str, PASS/FAIL/SKIP) for this field.

    Each `field` name matches a documented check in dataset.yaml.

    Args:
        report: Evaluation output whose attributes are inspected.
        exp: The expectation; `exp.field` selects which comparison runs.

    Returns:
        The predicted value rendered as text and the verdict. Unknown field
        names, a missing fleet, or a missing prod tier yield "SKIP".
    """
    field = exp.field

    if field == "attention_variant":
        attention = report.profile.attention
        attn_actual = attention.variant if attention else "(none)"
        return attn_actual, ("PASS" if attn_actual == exp.expected else "FAIL")

    if field == "quantization":
        quant_actual = report.weight.quantization_guess.value
        return quant_actual, ("PASS" if quant_actual == exp.expected else "FAIL")

    if field == "is_moe":
        actual_bool = report.profile.is_moe
        return str(actual_bool), ("PASS" if actual_bool == exp.expected else "FAIL")

    if field == "weight_bytes":
        actual_int = report.weight.total_bytes.value
        # Test the bounds with `is None`, not truthiness: an explicit bound of
        # 0 must be enforced, matching how `_fmt_expected` displays it.
        above_min = exp.expected_min is None or actual_int >= exp.expected_min
        below_max = exp.expected_max is None or actual_int <= exp.expected_max
        return f"{actual_int:,}", ("PASS" if above_min and below_max else "FAIL")

    if field == "fleet_prod_gpus":
        prod, reason = _prod_fleet_option(report)
        if prod is None:
            return reason, "SKIP"
        passed = prod.gpu_count == exp.expected
        return str(prod.gpu_count), ("PASS" if passed else "FAIL")

    if field == "fleet_prod_gpus_at_most":
        prod, reason = _prod_fleet_option(report)
        if prod is None:
            return reason, "SKIP"
        # A missing `expected` is treated as a ceiling of 0 (unchanged behavior).
        passed = prod.gpu_count <= int(exp.expected or 0)
        return f"{prod.gpu_count} (max {exp.expected})", ("PASS" if passed else "FAIL")

    return "(unknown field)", "SKIP"
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def _fmt_expected(exp: Expectation) -> str:
    """Render an expectation's target for display.

    An exact `expected` value wins; otherwise the min/max pair is shown as an
    interval with "-∞" / "+∞" for a missing side; with neither, a placeholder.
    """
    exact, floor, ceiling = exp.expected, exp.expected_min, exp.expected_max
    if exact is not None:
        return str(exact)
    if floor is None and ceiling is None:
        return "(unspecified)"
    lower = "-∞" if floor is None else f"{floor:,}"
    upper = "+∞" if ceiling is None else f"{ceiling:,}"
    return f"[{lower}, {upper}]"
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def render_results(results: list[CheckResult], console: Console | None = None) -> None:
    """Print the per-check table, the PASS/FAIL/SKIP totals and, on failure, a hint.

    Args:
        results: Check results, in the order they should be displayed.
        console: Rich console to print to; a new `Console()` is used if omitted.
    """
    out = console or Console()

    table = Table(
        title="Benchmark results",
        title_justify="left",
        show_header=True,
        header_style="dim",
        box=None,
        padding=(0, 2),
    )
    for heading in ("entry", "field", "predicted", "expected", "status"):
        table.add_column(heading)

    style_for = {"PASS": "bold green", "FAIL": "bold red", "SKIP": "dim yellow"}

    previous_entry = None
    for row in results:
        # Print the entry name only on the first row of each consecutive group.
        label = "" if row.entry_name == previous_entry else row.entry_name
        previous_entry = row.entry_name
        table.add_row(
            label,
            row.field,
            row.predicted,
            row.expected,
            f"[{style_for[row.status]}]{row.status}[/]",
        )
    out.print(table)

    tally = {"PASS": 0, "FAIL": 0, "SKIP": 0}
    for row in results:
        if row.status in tally:
            tally[row.status] += 1
    total = len(results)
    passed, failed, skipped = tally["PASS"], tally["FAIL"], tally["SKIP"]

    summary = (
        f"Total: {total} "
        f"[bold green]PASS: {passed}[/] "
        f"[bold red]FAIL: {failed}[/] "
        f"[dim yellow]SKIP: {skipped}[/]"
    )
    out.print(summary)

    if failed > 0:
        out.print(
            "[dim]Failures show the tool's prediction diverges from a curated "
            "source. Check the `source` column for the expected-value provenance.[/]"
        )
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
def exit_code_from(results: list[CheckResult]) -> int:
    """Map results to a process exit code: 1 if any check FAILed, otherwise 0.

    SKIP results never cause a non-zero exit code.
    """
    for outcome in results:
        if outcome.status == "FAIL":
            return 1
    return 0
|
src/llm_cal/cli.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CLI entry point. Thin shell over `Evaluator` + rich formatter."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import sys
|
| 6 |
+
|
| 7 |
+
import typer
|
| 8 |
+
from rich.console import Console
|
| 9 |
+
|
| 10 |
+
from llm_cal.benchmark.runner import exit_code_from, render_results, run_all
|
| 11 |
+
from llm_cal.common.i18n import detect_locale_from_env, get_locale, set_locale, t
|
| 12 |
+
from llm_cal.core.evaluator import Evaluator
|
| 13 |
+
from llm_cal.core.explain import build as build_explain
|
| 14 |
+
from llm_cal.hardware.loader import load_database
|
| 15 |
+
from llm_cal.llm_review.reviewer import run_review
|
| 16 |
+
from llm_cal.model_source.base import (
|
| 17 |
+
AuthRequiredError,
|
| 18 |
+
ModelNotFoundError,
|
| 19 |
+
ModelSource,
|
| 20 |
+
SourceUnavailableError,
|
| 21 |
+
)
|
| 22 |
+
from llm_cal.model_source.huggingface import HuggingFaceSource
|
| 23 |
+
from llm_cal.model_source.modelscope import ModelScopeSource
|
| 24 |
+
from llm_cal.output.formatter import (
|
| 25 |
+
render,
|
| 26 |
+
render_explain,
|
| 27 |
+
render_gpu_list,
|
| 28 |
+
render_llm_review,
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
# Set locale from env first; --lang flag can override inside main()
set_locale(detect_locale_from_env())

# The Typer application; invoking it with no arguments prints the help text.
app = typer.Typer(
    name="llm-cal",
    help="LLM inference hardware calculator.",
    no_args_is_help=True,
)
# Regular output goes to `_console`; error messages go to stderr via `_err`.
_console = Console()
_err = Console(stderr=True)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@app.command()
def main(
    model_id: str | None = typer.Argument(None, help="HuggingFace or ModelScope model id"),
    gpu: str | None = typer.Option(None, "--gpu", help="GPU type, e.g. H800, A100-80G"),
    engine: str = typer.Option("vllm", "--engine", help="Inference engine: vllm | sglang"),
    gpu_count: int | None = typer.Option(
        None, "--gpu-count", help="Force GPU count (otherwise tool recommends)"
    ),
    context_length: int | None = typer.Option(
        None, "--context-length", help="Context length for KV cache estimation"
    ),
    refresh: bool = typer.Option(False, "--refresh", help="Bypass cache and re-fetch"),
    lang: str | None = typer.Option(
        None,
        "--lang",
        help="Output language: en | zh (default auto-detects from LANG env)",
    ),
    list_gpus: bool = typer.Option(
        False,
        "--list-gpus",
        help="List all supported GPUs and exit (no model_id needed)",
    ),
    benchmark: bool = typer.Option(
        False,
        "--benchmark",
        help=(
            "Run the curated benchmark dataset: compare tool output against "
            "reference values from HF API, model cards, vLLM recipes. "
            "Requires network. Exit 0 on all-pass, 1 if any FAIL."
        ),
    ),
    input_tokens: int = typer.Option(
        2000,
        "--input-tokens",
        help="Input token budget for prefill-latency estimation (default: 2000).",
    ),
    output_tokens: int = typer.Option(
        512,
        "--output-tokens",
        help="Output token budget for total-latency math (default: 512).",
    ),
    target_tokens_per_sec: float = typer.Option(
        30.0,
        "--target-tokens-per-sec",
        help="SLA: per-user decode tokens/second (drives L bound). Default: 30.",
    ),
    prefill_util: float = typer.Option(
        0.40,
        "--prefill-util",
        help="Compute utilization factor for prefill (empirical, default 0.40).",
    ),
    decode_bw_util: float = typer.Option(
        0.50,
        "--decode-bw-util",
        help="Memory-bandwidth utilization factor for decode (default 0.50).",
    ),
    concurrency_degradation: float = typer.Option(
        1.0,
        "--concurrency-degradation",
        help=(
            "High-concurrency throughput degradation factor (default 1.0 = "
            "no degradation — the honest baseline). If your engine drops "
            "to 60% efficiency under load, pass 1.67. See docs/methodology.md."
        ),
    ),
    explain: bool = typer.Option(
        False,
        "--explain",
        help=(
            "Print the full derivation trace (formula, inputs, step-by-step, "
            "source) for every non-trivial number. Feed the output to an LLM "
            "if you want a second opinion on the math."
        ),
    ),
    llm_review: bool = typer.Option(
        False,
        "--llm-review",
        help=(
            "EXPERIMENTAL: send the derivation trace to an LLM for a second "
            "opinion. Output is tagged [llm-opinion] and never overrides the "
            "6 primary labels. Requires env vars: LLM_CAL_REVIEWER_API_KEY "
            "(required), LLM_CAL_REVIEWER_BASE_URL (default OpenAI), "
            "LLM_CAL_REVIEWER_MODEL (default gpt-4o)."
        ),
    ),
    source: str = typer.Option(
        "huggingface",
        "--source",
        help=(
            "Model source: huggingface (default) | modelscope. "
            "Auth via HF_TOKEN or MODELSCOPE_API_TOKEN env var."
        ),
    ),
) -> None:
    """Evaluate a model against target hardware."""
    # NOTE: the docstring above is kept short on purpose; Typer can surface a
    # command's docstring as its help text, so treat it as user-facing output.
    #
    # Exit codes used below: 1 = usage error (or a FAIL in --benchmark),
    # 2 = authentication required, 3 = model not found, 4 = source unavailable.
    if lang is not None:
        if lang in ("en", "zh"):
            set_locale(lang)  # type: ignore[arg-type]
        else:
            # An unsupported value used to be dropped silently; keep the
            # env-detected locale but tell the user why --lang had no effect.
            _err.print(f"[yellow]Ignoring unknown --lang '{lang}'. Use 'en' or 'zh'.[/yellow]")

    # Meta commands short-circuit before requiring model_id + --gpu.
    if list_gpus:
        render_gpu_list(load_database(), _console)
        return

    if benchmark:
        results = run_all()
        render_results(results, _console)
        sys.exit(exit_code_from(results))

    if not model_id:
        _err.print("[red]Missing argument MODEL_ID. Use --help for usage.[/red]")
        raise typer.Exit(code=1)
    if not gpu:
        _err.print("[red]Missing option --gpu. Use --list-gpus to see choices.[/red]")
        raise typer.Exit(code=1)

    # Resolve the model source; short aliases "hf" / "ms" are accepted too.
    src_obj: ModelSource
    src_lower = source.lower()
    if src_lower in ("hf", "huggingface"):
        src_obj = HuggingFaceSource()
    elif src_lower in ("ms", "modelscope"):
        src_obj = ModelScopeSource()
    else:
        _err.print(
            f"[red]Unknown --source '{source}'. Use 'huggingface' or 'modelscope'.[/red]"
        )
        raise typer.Exit(code=1)

    evaluator = Evaluator(source=src_obj)
    try:
        report = evaluator.evaluate(
            model_id=model_id,
            gpu=gpu,
            engine=engine,
            gpu_count=gpu_count,
            context_length=context_length,
            refresh=refresh,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            target_tokens_per_sec=target_tokens_per_sec,
            prefill_utilization=prefill_util,
            decode_bw_utilization=decode_bw_util,
            concurrency_degradation=concurrency_degradation,
        )
    except AuthRequiredError as e:
        _err.print(f"[bold red]{t('cli.err.auth_required')}[/bold red] {e}")
        sys.exit(2)
    except ModelNotFoundError as e:
        _err.print(f"[bold red]{t('cli.err.model_not_found')}[/bold red] {e}")
        sys.exit(3)
    except SourceUnavailableError as e:
        _err.print(f"[bold red]{t('cli.err.source_unavailable')}[/bold red] {e}")
        sys.exit(4)

    render(report, _console)
    # The derivation trace is built once and shared by --explain and --llm-review.
    explain_entries = build_explain(report) if (explain or llm_review) else []
    if explain:
        render_explain(explain_entries, _console)
    if llm_review:
        # Locale at this point has been resolved by set_locale() calls above.
        result = run_review(explain_entries, locale=get_locale())
        render_llm_review(result, _console)
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
# Run the Typer app when this module is executed directly as a script.
if __name__ == "__main__":
    app()
|
src/llm_cal/command_generator/__init__.py
ADDED
|
File without changes
|
src/llm_cal/command_generator/sglang.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Generate a ready-to-copy SGLang launch command."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from llm_cal.architecture.profile import ArchitectureProfile
|
| 6 |
+
from llm_cal.engine_compat.loader import EngineCompatEntry
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def generate_sglang_command(
    model_id: str,
    profile: ArchitectureProfile,
    tensor_parallel_size: int,
    entry: EngineCompatEntry | None,
    max_model_len: int | None = None,
) -> str:
    """Generate a multi-line `python -m sglang.launch_server ...` command string.

    Args:
        model_id: Model identifier or path; shell-quoted when it contains
            characters that are unsafe to paste into a shell.
        profile: Architecture profile; supplies the fallback context length
            and the model type used for the trust-remote-code heuristic.
        tensor_parallel_size: Value emitted for `--tp`.
        entry: Optional engine-compat entry whose required and optional flags
            are appended verbatim.
        max_model_len: Explicit context length; overrides the profile's value.

    Returns:
        The command, one flag per line, lines joined with a trailing backslash.
    """
    import shlex  # function-scope import: only this function needs it

    lines: list[str] = [
        "python -m sglang.launch_server",
        # quote() leaves ordinary hub ids untouched and protects ids/paths
        # containing spaces or shell metacharacters.
        f" --model-path {shlex.quote(model_id)}",
        f" --tp {tensor_parallel_size}",
    ]

    # Pick the context length from the profile if the caller didn't override.
    effective_max = max_model_len
    if effective_max is None and profile.position is not None:
        effective_max = profile.position.max_position_embeddings
    if effective_max:
        lines.append(f" --context-length {effective_max}")

    if _needs_trust_remote_code(profile.model_type):
        lines.append(" --trust-remote-code")

    lines.append(" --mem-fraction-static 0.9")

    if entry is not None:
        for flag in (*entry.required_flags, *entry.optional_flags):
            lines.append(" " + _render_flag(flag.flag, flag.value))

    return " \\\n".join(lines)


def _render_flag(flag: str, value: str | None) -> str:
    """Render a flag as `flag` when it has no value, else as `flag value`."""
    if value is None:
        return flag
    return f"{flag} {value}"


def _needs_trust_remote_code(model_type: str) -> bool:
    """Return True for model-type prefixes this tool emits `--trust-remote-code` for."""
    return model_type.startswith(("deepseek", "qwen2_moe", "qwen3_moe", "mixtral"))
|
src/llm_cal/command_generator/vllm.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Generate a ready-to-copy vllm serve command."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from llm_cal.architecture.profile import ArchitectureProfile
|
| 6 |
+
from llm_cal.engine_compat.loader import EngineCompatEntry
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def generate_vllm_command(
    model_id: str,
    profile: ArchitectureProfile,
    tensor_parallel_size: int,
    entry: EngineCompatEntry | None,
    max_model_len: int | None = None,
) -> str:
    """Generate a multi-line `vllm serve ...` command string.

    If `entry` is given, appends required_flags and optional_flags verbatim.

    Args:
        model_id: Model identifier or path; shell-quoted when it contains
            characters that are unsafe to paste into a shell.
        profile: Architecture profile; supplies the fallback max model length
            and the model type used for the trust-remote-code heuristic.
        tensor_parallel_size: Value emitted for `--tensor-parallel-size`.
        entry: Optional engine-compat entry providing extra flags.
        max_model_len: Explicit max model length; overrides the profile's value.

    Returns:
        The command, one flag per line, lines joined with a trailing backslash.
    """
    import shlex  # function-scope import: only this function needs it

    lines: list[str] = [
        # quote() leaves ordinary hub ids untouched and protects ids/paths
        # containing spaces or shell metacharacters.
        "vllm serve " + shlex.quote(model_id),
        f" --tensor-parallel-size {tensor_parallel_size}",
    ]

    # Pick max-model-len from profile if caller didn't override.
    effective_max = max_model_len
    if effective_max is None and profile.position is not None:
        effective_max = profile.position.max_position_embeddings
    if effective_max:
        lines.append(f" --max-model-len {effective_max}")

    # DeepSeek and friends need trust-remote-code. Heuristic: non-trivial model_type.
    if _needs_trust_remote_code(profile.model_type):
        lines.append(" --trust-remote-code")

    lines.append(" --gpu-memory-utilization 0.9")

    if entry is not None:
        for flag in (*entry.required_flags, *entry.optional_flags):
            lines.append(" " + _render_flag(flag.flag, flag.value))

    return " \\\n".join(lines)


def _render_flag(flag: str, value: str | None) -> str:
    """Render a flag as `flag` when it has no value, else as `flag value`."""
    if value is None:
        return flag
    return f"{flag} {value}"


def _needs_trust_remote_code(model_type: str) -> bool:
    """Models that ship custom modeling code in the repo."""
    return model_type.startswith(("deepseek", "qwen2_moe", "qwen3_moe", "mixtral"))
|
src/llm_cal/common/__init__.py
ADDED
|
File without changes
|
src/llm_cal/common/i18n.py
ADDED
|
@@ -0,0 +1,421 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Minimal i18n layer. No gettext, no external deps.
|
| 2 |
+
|
| 3 |
+
Supports `en` and `zh`. Defaults to `en` but auto-detects from LC_ALL/LANG
|
| 4 |
+
when they start with `zh` (covers zh_CN, zh_TW, zh_HK, etc.).
|
| 5 |
+
|
| 6 |
+
Usage:
|
| 7 |
+
from llm_cal.common.i18n import t, set_locale
|
| 8 |
+
set_locale("zh")
|
| 9 |
+
print(t("labels.legend")) # "标签"
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
from typing import Literal
|
| 16 |
+
|
| 17 |
+
Locale = Literal["en", "zh"]
|
| 18 |
+
|
| 19 |
+
_current_locale: Locale = "en"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
_MESSAGES: dict[str, dict[Locale, str]] = {
|
| 23 |
+
# CLI help text
|
| 24 |
+
"cli.help": {
|
| 25 |
+
"en": "LLM inference hardware calculator.",
|
| 26 |
+
"zh": "大模型推理硬件计算器。",
|
| 27 |
+
},
|
| 28 |
+
"cli.arg.model_id": {
|
| 29 |
+
"en": "HuggingFace or ModelScope model id",
|
| 30 |
+
"zh": "HuggingFace 或 ModelScope 的 model id",
|
| 31 |
+
},
|
| 32 |
+
"cli.opt.gpu": {
|
| 33 |
+
"en": "GPU type, e.g. H800, A100-80G",
|
| 34 |
+
"zh": "GPU 型号,例如 H800、A100-80G",
|
| 35 |
+
},
|
| 36 |
+
"cli.opt.engine": {
|
| 37 |
+
"en": "Inference engine: vllm | sglang",
|
| 38 |
+
"zh": "推理引擎:vllm | sglang",
|
| 39 |
+
},
|
| 40 |
+
"cli.opt.gpu_count": {
|
| 41 |
+
"en": "Force GPU count (otherwise tool recommends min/dev/prod)",
|
| 42 |
+
"zh": "强制指定 GPU 张数(默认由工具推荐 min/dev/prod 三档)",
|
| 43 |
+
},
|
| 44 |
+
"cli.opt.context_length": {
|
| 45 |
+
"en": "Context length for KV cache estimation",
|
| 46 |
+
"zh": "用于 KV cache 估算的上下文长度",
|
| 47 |
+
},
|
| 48 |
+
"cli.opt.refresh": {
|
| 49 |
+
"en": "Bypass cache and re-fetch",
|
| 50 |
+
"zh": "绕过缓存重新拉取",
|
| 51 |
+
},
|
| 52 |
+
"cli.opt.lang": {
|
| 53 |
+
"en": "Output language: en | zh",
|
| 54 |
+
"zh": "输出语言:en | zh",
|
| 55 |
+
},
|
| 56 |
+
"cli.err.auth_required": {
|
| 57 |
+
"en": "Authentication required:",
|
| 58 |
+
"zh": "需要认证:",
|
| 59 |
+
},
|
| 60 |
+
"cli.err.model_not_found": {
|
| 61 |
+
"en": "Model not found:",
|
| 62 |
+
"zh": "模型未找到:",
|
| 63 |
+
},
|
| 64 |
+
"cli.err.source_unavailable": {
|
| 65 |
+
"en": "Source unavailable:",
|
| 66 |
+
"zh": "数据源不可用:",
|
| 67 |
+
},
|
| 68 |
+
# Panel / section titles
|
| 69 |
+
"panel.via": {"en": "via", "zh": "来源"},
|
| 70 |
+
"section.architecture": {"en": "Architecture", "zh": "架构"},
|
| 71 |
+
"section.weights": {"en": "Weights", "zh": "权重"},
|
| 72 |
+
"section.kv_cache": {
|
| 73 |
+
"en": "KV cache per request (BF16/FP16)",
|
| 74 |
+
"zh": "单请求 KV Cache(BF16/FP16)",
|
| 75 |
+
},
|
| 76 |
+
"section.reconciliation": {
|
| 77 |
+
"en": "Quantization reconciliation (observed vs predicted per scheme)",
|
| 78 |
+
"zh": "量化方案对账(观测值 vs 各方案预测值)",
|
| 79 |
+
},
|
| 80 |
+
"section.engine_compat": {
|
| 81 |
+
"en": "Engine compatibility",
|
| 82 |
+
"zh": "推理引擎兼容性",
|
| 83 |
+
},
|
| 84 |
+
"section.hardware": {"en": "Target hardware", "zh": "目标硬件"},
|
| 85 |
+
"section.labels": {"en": "labels:", "zh": "标签:"},
|
| 86 |
+
# Architecture row labels
|
| 87 |
+
"arch.model_type": {"en": "model_type", "zh": "模型类型"},
|
| 88 |
+
"arch.family": {"en": "family", "zh": "架构族"},
|
| 89 |
+
"arch.confidence": {"en": "confidence", "zh": "识别置信度"},
|
| 90 |
+
"arch.layers": {"en": "layers", "zh": "层数"},
|
| 91 |
+
"arch.hidden_size": {"en": "hidden_size", "zh": "隐藏维度"},
|
| 92 |
+
"arch.vocab_size": {"en": "vocab_size", "zh": "词表大小"},
|
| 93 |
+
"arch.attention": {"en": "attention", "zh": "注意力机制"},
|
| 94 |
+
"arch.compress_ratios": {"en": "compress_ratios", "zh": "压缩比数组"},
|
| 95 |
+
"arch.moe": {"en": "moe", "zh": "MoE"},
|
| 96 |
+
"arch.sliding_window": {"en": "sliding_window", "zh": "滑动窗口"},
|
| 97 |
+
"arch.max_position": {
|
| 98 |
+
"en": "max_position_embeddings",
|
| 99 |
+
"zh": "最大上下文长度",
|
| 100 |
+
},
|
| 101 |
+
"arch.none": {"en": "(none)", "zh": "(无)"},
|
| 102 |
+
"arch.compress_ratios_summary": {
|
| 103 |
+
"en": "len={n}, dense_layers={dense}",
|
| 104 |
+
"zh": "长度={n},dense 层数={dense}",
|
| 105 |
+
},
|
| 106 |
+
"arch.moe_summary": {
|
| 107 |
+
"en": "{routed} routed + {shared} shared, top-{topk}",
|
| 108 |
+
"zh": "{routed} 个 routed + {shared} 个 shared,top-{topk}",
|
| 109 |
+
},
|
| 110 |
+
"arch.attn_summary": {
|
| 111 |
+
"en": "{variant} (heads={heads}, kv_heads={kv_heads}, head_dim={head_dim})",
|
| 112 |
+
"zh": "{variant}(heads={heads},kv_heads={kv_heads},head_dim={head_dim})",
|
| 113 |
+
},
|
| 114 |
+
"arch.unsupported_state_space": {
|
| 115 |
+
"en": "State-space models are not supported in v0.1 (planned for v0.3+).",
|
| 116 |
+
"zh": "状态空间模型(Mamba 类)在 v0.1 暂不支持,计划在 v0.3+ 加入。",
|
| 117 |
+
},
|
| 118 |
+
# Weights rows
|
| 119 |
+
"weights.safetensors_bytes": {
|
| 120 |
+
"en": "safetensors bytes",
|
| 121 |
+
"zh": "safetensors 总字节",
|
| 122 |
+
},
|
| 123 |
+
"weights.params_estimated": {
|
| 124 |
+
"en": "estimated total params",
|
| 125 |
+
"zh": "参数量(估算)",
|
| 126 |
+
},
|
| 127 |
+
"weights.bits_per_param": {"en": "bits/param", "zh": "每参数位数"},
|
| 128 |
+
"weights.quant_guess": {"en": "quantization guess", "zh": "量化方案推断"},
|
| 129 |
+
# Reconciliation
|
| 130 |
+
"recon.scheme": {"en": "scheme", "zh": "量化方案"},
|
| 131 |
+
"recon.predicted": {"en": "predicted bytes", "zh": "预测字节"},
|
| 132 |
+
"recon.delta": {"en": "delta", "zh": "差值"},
|
| 133 |
+
"recon.error_pct": {"en": "error %", "zh": "误差 %"},
|
| 134 |
+
"recon.over": {"en": "over", "zh": "偏高"},
|
| 135 |
+
"recon.under": {"en": "under", "zh": "偏低"},
|
| 136 |
+
"recon.best": {"en": "best match:", "zh": "最佳匹配:"},
|
| 137 |
+
# KV cache
|
| 138 |
+
"kv.context": {"en": "context", "zh": "上下文"},
|
| 139 |
+
"kv.kv_cache": {"en": "KV cache", "zh": "KV Cache"},
|
| 140 |
+
"kv.label": {"en": "label", "zh": "标签"},
|
| 141 |
+
"kv.tokens": {"en": "tokens", "zh": "tokens"},
|
| 142 |
+
# Engine compatibility
|
| 143 |
+
"engine.version_spec": {"en": "version", "zh": "版本要求"},
|
| 144 |
+
"engine.support": {"en": "support", "zh": "支持程度"},
|
| 145 |
+
"engine.verification": {"en": "verification", "zh": "验证等级"},
|
| 146 |
+
"engine.required_flags": {"en": "required flags", "zh": "必需参数"},
|
| 147 |
+
"engine.optional_flags": {"en": "optional flags", "zh": "可选参数"},
|
| 148 |
+
"engine.caveats": {"en": "caveats", "zh": "注意事项"},
|
| 149 |
+
"engine.sources": {"en": "sources", "zh": "来源"},
|
| 150 |
+
"engine.no_match": {
|
| 151 |
+
"en": "No compatibility entry for this model + engine in v0.1 matrix.",
|
| 152 |
+
"zh": "v0.1 兼容矩阵中暂无此模型 + 引擎的条目。",
|
| 153 |
+
},
|
| 154 |
+
# Hardware
|
| 155 |
+
"hw.memory": {"en": "memory", "zh": "显存"},
|
| 156 |
+
"hw.nvlink_bandwidth": {"en": "NVLink bandwidth", "zh": "NVLink 带宽"},
|
| 157 |
+
"hw.fp16_tflops": {"en": "FP16 TFLOPS", "zh": "FP16 算力"},
|
| 158 |
+
"hw.fp8_support": {"en": "FP8 support", "zh": "FP8 支持"},
|
| 159 |
+
"hw.fp4_support": {"en": "FP4 support", "zh": "FP4 支持"},
|
| 160 |
+
"hw.notes": {"en": "notes", "zh": "备注"},
|
| 161 |
+
"hw.spec_source": {"en": "spec source", "zh": "规格来源"},
|
| 162 |
+
# GPU list subcommand
|
| 163 |
+
"gpus.list.title": {
|
| 164 |
+
"en": "Supported GPUs",
|
| 165 |
+
"zh": "支持的 GPU",
|
| 166 |
+
},
|
| 167 |
+
"gpus.col.id": {"en": "id", "zh": "型号"},
|
| 168 |
+
"gpus.col.memory": {"en": "memory", "zh": "显存"},
|
| 169 |
+
"gpus.col.nvlink": {"en": "NVLink / fabric", "zh": "互联带宽"},
|
| 170 |
+
"gpus.col.fp16": {"en": "FP16 TFLOPS", "zh": "FP16"},
|
| 171 |
+
"gpus.col.fp8": {"en": "FP8", "zh": "FP8"},
|
| 172 |
+
"gpus.col.fp4": {"en": "FP4", "zh": "FP4"},
|
| 173 |
+
"gpus.col.aliases": {"en": "aliases", "zh": "别名"},
|
| 174 |
+
"gpus.total": {
|
| 175 |
+
"en": "Total: {count} GPUs (pass any id or alias to --gpu)",
|
| 176 |
+
"zh": "共 {count} 款(--gpu 后面填 ID 或别名均可)",
|
| 177 |
+
},
|
| 178 |
+
"hw.unknown": {
|
| 179 |
+
"en": "Unknown GPU '{gpu}'. Known: {known}",
|
| 180 |
+
"zh": "未知 GPU '{gpu}'。已知型号:{known}",
|
| 181 |
+
},
|
| 182 |
+
"hw.bool_yes": {"en": "yes", "zh": "是"},
|
| 183 |
+
"hw.bool_no": {"en": "no", "zh": "否"},
|
| 184 |
+
# Labels — localized display names. Enum identity stays English.
|
| 185 |
+
"label.verified": {"en": "verified", "zh": "已验证"},
|
| 186 |
+
"label.inferred": {"en": "inferred", "zh": "推断"},
|
| 187 |
+
"label.estimated": {"en": "estimated", "zh": "估算"},
|
| 188 |
+
"label.cited": {"en": "cited", "zh": "引用"},
|
| 189 |
+
"label.unverified": {"en": "unverified", "zh": "未经验证"},
|
| 190 |
+
"label.unknown": {"en": "unknown", "zh": "未知"},
|
| 191 |
+
"label.llm-opinion": {"en": "llm-opinion", "zh": "LLM 观点"},
|
| 192 |
+
# Source attribution
|
| 193 |
+
"source.pr": {"en": "PR", "zh": "PR"},
|
| 194 |
+
"source.release_notes": {"en": "release notes", "zh": "release note"},
|
| 195 |
+
"source.announcement": {"en": "announcement", "zh": "官方公告"},
|
| 196 |
+
"source.tested": {"en": "tested", "zh": "实测"},
|
| 197 |
+
"source.captured_on": {"en": "captured on", "zh": "采集于"},
|
| 198 |
+
# Fleet planner
|
| 199 |
+
"section.fleet": {
|
| 200 |
+
"en": "Recommended fleet",
|
| 201 |
+
"zh": "推荐 GPU 张数",
|
| 202 |
+
},
|
| 203 |
+
"fleet.col.tier": {"en": "tier", "zh": "档位"},
|
| 204 |
+
"fleet.col.gpus": {"en": "GPUs", "zh": "GPU 数"},
|
| 205 |
+
"fleet.col.weight_per_gpu": {
|
| 206 |
+
"en": "weight / GPU",
|
| 207 |
+
"zh": "单卡权重",
|
| 208 |
+
},
|
| 209 |
+
"fleet.col.headroom_per_gpu": {
|
| 210 |
+
"en": "headroom / GPU",
|
| 211 |
+
"zh": "单卡余量",
|
| 212 |
+
},
|
| 213 |
+
"fleet.col.fit": {"en": "fit", "zh": "评估"},
|
| 214 |
+
"fleet.col.concurrent_at_ctx": {
|
| 215 |
+
"en": "concurrent @ {ctx}",
|
| 216 |
+
"zh": "并发 @ {ctx}",
|
| 217 |
+
},
|
| 218 |
+
"fleet.tier.min": {"en": "min", "zh": "最小"},
|
| 219 |
+
"fleet.tier.dev": {"en": "dev", "zh": "开发"},
|
| 220 |
+
"fleet.tier.prod": {"en": "prod", "zh": "生产"},
|
| 221 |
+
"fleet.best_marker": {
|
| 222 |
+
"en": "= recommended",
|
| 223 |
+
"zh": "= 推荐档位",
|
| 224 |
+
},
|
| 225 |
+
"fleet.constraint": {"en": "constraint:", "zh": "约束:"},
|
| 226 |
+
"fleet.forced": {
|
| 227 |
+
"en": "Forced GPU count (--gpu-count was set)",
|
| 228 |
+
"zh": "已强制指定 GPU 张数(--gpu-count)",
|
| 229 |
+
},
|
| 230 |
+
"fleet.gpu_spec_unknown": {
|
| 231 |
+
"en": "Fleet planning skipped — GPU spec unknown.",
|
| 232 |
+
"zh": "GPU 规格未知,跳过 fleet 规划。",
|
| 233 |
+
},
|
| 234 |
+
# Command generator
|
| 235 |
+
"section.command": {
|
| 236 |
+
"en": "Generated command",
|
| 237 |
+
"zh": "生成的启动命令",
|
| 238 |
+
},
|
| 239 |
+
"command.tier_note": {
|
| 240 |
+
"en": "tier: {tier} ({gpus} GPUs)",
|
| 241 |
+
"zh": "档位:{tier}({gpus} 张)",
|
| 242 |
+
},
|
| 243 |
+
# Performance section
|
| 244 |
+
"section.performance": {
|
| 245 |
+
"en": "Performance analysis",
|
| 246 |
+
"zh": "性能分析",
|
| 247 |
+
},
|
| 248 |
+
"perf.assumptions_note": {
|
| 249 |
+
"en": (
|
| 250 |
+
"Assumes input={input_tokens} tokens, output={output_tokens} tokens, "
|
| 251 |
+
"target {target_tps} tok/s per user. "
|
| 252 |
+
"Utilization: prefill={prefill_util:.0%} / decode_bw={decode_util:.0%} "
|
| 253 |
+
"/ concurrency_degradation={degradation:.2f}x. "
|
| 254 |
+
"All numbers are [estimated] — see docs/methodology.md for formula sources "
|
| 255 |
+
"and override via --prefill-util / --decode-bw-util / --concurrency-degradation."
|
| 256 |
+
),
|
| 257 |
+
"zh": (
|
| 258 |
+
"假设输入 {input_tokens} tokens、输出 {output_tokens} tokens、"
|
| 259 |
+
"每用户目标 {target_tps} tok/s。"
|
| 260 |
+
"利用率:prefill={prefill_util:.0%} / decode_bw={decode_util:.0%} "
|
| 261 |
+
"/ 并发退化={degradation:.2f}x。"
|
| 262 |
+
"所有数字都是 [估算]——公式来源见 docs/methodology.md,"
|
| 263 |
+
"可通过 --prefill-util / --decode-bw-util / --concurrency-degradation 覆盖。"
|
| 264 |
+
),
|
| 265 |
+
},
|
| 266 |
+
"perf.prefill_latency": {
|
| 267 |
+
"en": "Prefill latency (single request)",
|
| 268 |
+
"zh": "Prefill 延迟(单请求)",
|
| 269 |
+
},
|
| 270 |
+
"perf.decode_throughput_cluster": {
|
| 271 |
+
"en": "Decode throughput (cluster)",
|
| 272 |
+
"zh": "Decode 吞吐(集群)",
|
| 273 |
+
},
|
| 274 |
+
"perf.decode_throughput_per_gpu": {
|
| 275 |
+
"en": "Decode throughput (per GPU)",
|
| 276 |
+
"zh": "Decode 吞吐(单卡)",
|
| 277 |
+
},
|
| 278 |
+
"perf.decode_moe_active_optimistic": {
|
| 279 |
+
"en": "Decode throughput (MoE active-only, optimistic)",
|
| 280 |
+
"zh": "Decode 吞吐(MoE 仅激活专家,乐观估算)",
|
| 281 |
+
},
|
| 282 |
+
"perf.k_bound": {
|
| 283 |
+
"en": "K bound (memory-capacity)",
|
| 284 |
+
"zh": "K 上限(显存容量)",
|
| 285 |
+
},
|
| 286 |
+
"perf.l_bound": {
|
| 287 |
+
"en": "L bound (compute / bandwidth @ SLA)",
|
| 288 |
+
"zh": "L 上限(算力/带宽 @ SLA)",
|
| 289 |
+
},
|
| 290 |
+
"perf.max_concurrent": {
|
| 291 |
+
"en": "Max concurrent",
|
| 292 |
+
"zh": "最大并发",
|
| 293 |
+
},
|
| 294 |
+
"perf.bottleneck": {
|
| 295 |
+
"en": "Bottleneck",
|
| 296 |
+
"zh": "瓶颈类型",
|
| 297 |
+
},
|
| 298 |
+
"perf.bottleneck.memory_capacity": {
|
| 299 |
+
"en": "Memory capacity",
|
| 300 |
+
"zh": "显存容量",
|
| 301 |
+
},
|
| 302 |
+
"perf.bottleneck.memory_bandwidth": {
|
| 303 |
+
"en": "Memory bandwidth / compute",
|
| 304 |
+
"zh": "显存带宽 / 算力",
|
| 305 |
+
},
|
| 306 |
+
"perf.bottleneck.compute": {
|
| 307 |
+
"en": "Compute",
|
| 308 |
+
"zh": "算力",
|
| 309 |
+
},
|
| 310 |
+
"perf.bottleneck.insufficient_data": {
|
| 311 |
+
"en": "Insufficient data",
|
| 312 |
+
"zh": "数据不足",
|
| 313 |
+
},
|
| 314 |
+
"perf.optimization.header": {
|
| 315 |
+
"en": "Optimization suggestions",
|
| 316 |
+
"zh": "优化建议",
|
| 317 |
+
},
|
| 318 |
+
"perf.opt.quantize_int4": {
|
| 319 |
+
"en": "Quantize to INT4: weight bytes halve → decode tok/s roughly 2× → concurrency scales accordingly.",
|
| 320 |
+
"zh": "量化到 INT4:权重字节减半 → decode tok/s 约翻倍 → 并发能力随之提升。",
|
| 321 |
+
},
|
| 322 |
+
"perf.opt.relax_sla": {
|
| 323 |
+
"en": "Relax SLA: if per-user target drops to 15 tok/s, L bound roughly doubles.",
|
| 324 |
+
"zh": "放宽 SLA:若每用户目标降至 15 tok/s,L 上限约翻倍。",
|
| 325 |
+
},
|
| 326 |
+
"perf.opt.kv_fp8": {
|
| 327 |
+
"en": "KV cache FP8 quantization: halves per-request KV, doubles the K bound at long context.",
|
| 328 |
+
"zh": "KV cache 量化到 FP8:单请求 KV 减半,长上下文下 K 上限约翻倍。",
|
| 329 |
+
},
|
| 330 |
+
"perf.opt.moe_offload": {
|
| 331 |
+
"en": "MoE expert offload to CPU: frees HBM for more KV cache at the cost of PCIe latency per new expert.",
|
| 332 |
+
"zh": "MoE 专家卸载到 CPU:释放 HBM 给 KV cache,代价是新专家激活时的 PCIe 延迟。",
|
| 333 |
+
},
|
| 334 |
+
# Explain section
|
| 335 |
+
"section.explain": {
|
| 336 |
+
"en": "Full derivation traces (--explain)",
|
| 337 |
+
"zh": "完整推导链(--explain)",
|
| 338 |
+
},
|
| 339 |
+
"explain.formula": {"en": "Formula", "zh": "公式"},
|
| 340 |
+
"explain.inputs": {"en": "Inputs", "zh": "输入"},
|
| 341 |
+
"explain.steps": {"en": "Computation", "zh": "计算步骤"},
|
| 342 |
+
"explain.result": {"en": "Result", "zh": "结果"},
|
| 343 |
+
"explain.source": {"en": "Source", "zh": "来源"},
|
| 344 |
+
"explain.see_also": {"en": "See also", "zh": "延伸阅读"},
|
| 345 |
+
"explain.intro": {
|
| 346 |
+
"en": (
|
| 347 |
+
"Each entry below shows the formula used, the inputs that went in, "
|
| 348 |
+
"every computation step, and the primary source. "
|
| 349 |
+
"Paste any single entry into an LLM and ask 'does this math check out?' "
|
| 350 |
+
"— the tool stays deterministic, the second opinion is yours."
|
| 351 |
+
),
|
| 352 |
+
"zh": (
|
| 353 |
+
"下面每一项都给出所用公式、输入、每一步计算、主要来源。"
|
| 354 |
+
"把任一项复制粘贴给 LLM,问『这个推理对吗』即可。"
|
| 355 |
+
"工具保持确定性,second opinion 交给你。"
|
| 356 |
+
),
|
| 357 |
+
},
|
| 358 |
+
# LLM review section
|
| 359 |
+
"section.llm_review": {
|
| 360 |
+
"en": "LLM second opinion (--llm-review, EXPERIMENTAL)",
|
| 361 |
+
"zh": "LLM 审阅(--llm-review,实验性)",
|
| 362 |
+
},
|
| 363 |
+
"llm_review.disclaimer": {
|
| 364 |
+
"en": (
|
| 365 |
+
"⚠ This is a second opinion from an external LLM ({model} via {base_url}). "
|
| 366 |
+
"It is tagged [llm-opinion] and NEVER overrides the 6 primary labels. "
|
| 367 |
+
"LLMs can be wrong; the tool's deterministic output takes precedence."
|
| 368 |
+
),
|
| 369 |
+
"zh": (
|
| 370 |
+
"⚠ 以下是来自外部 LLM({model},经 {base_url})的第二意见。"
|
| 371 |
+
"标签为 [LLM 观点],**永远不覆盖** 前 6 级主标签。"
|
| 372 |
+
"LLM 可能出错;工具的确定性输出优先。"
|
| 373 |
+
),
|
| 374 |
+
},
|
| 375 |
+
"llm_review.unavailable": {
|
| 376 |
+
"en": "LLM review unavailable: {error}",
|
| 377 |
+
"zh": "LLM 审阅不可用:{error}",
|
| 378 |
+
},
|
| 379 |
+
"llm_review.setup_hint": {
|
| 380 |
+
"en": (
|
| 381 |
+
"To enable: export LLM_CAL_REVIEWER_API_KEY=<key> "
|
| 382 |
+
"[optional: LLM_CAL_REVIEWER_BASE_URL, LLM_CAL_REVIEWER_MODEL]"
|
| 383 |
+
),
|
| 384 |
+
"zh": (
|
| 385 |
+
"启用方法:export LLM_CAL_REVIEWER_API_KEY=<key> "
|
| 386 |
+
"[可选:LLM_CAL_REVIEWER_BASE_URL、LLM_CAL_REVIEWER_MODEL]"
|
| 387 |
+
),
|
| 388 |
+
},
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
def set_locale(loc: Locale) -> None:
    """Select the locale used by subsequent `t()` calls.

    The requested value is normalized so module state only ever holds a
    supported locale code.

    Args:
        loc: Requested locale. Matching ignores case and surrounding
            whitespace; any value starting with ``zh`` (``"zh"``,
            ``"zh_CN"``, ``"ZH-TW"``) selects Chinese. Every other value
            selects English.
    """
    global _current_locale
    # Normalize instead of storing verbatim: an unrecognized string would
    # otherwise leak out of get_locale() and break its `Locale` contract.
    requested = str(loc).strip().lower()
    _current_locale = "zh" if requested.startswith("zh") else "en"
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
def get_locale() -> Locale:
    """Return the locale code that is currently active for message lookup."""
    active = _current_locale
    return active
|
| 399 |
+
|
| 400 |
+
|
| 401 |
+
def detect_locale_from_env() -> Locale:
    """Auto-detect the output locale from the standard locale env vars.

    Variables are consulted in precedence order: ``LC_ALL``, then
    ``LC_MESSAGES``, then ``LANG``. The first one set to a non-empty value
    decides the result; lower-precedence variables are not consulted after
    that, so ``LC_ALL=en_US.UTF-8`` wins over ``LANG=zh_CN.UTF-8``.

    Returns:
        ``"zh"`` when the deciding variable starts with ``zh``
        (case-insensitive), otherwise ``"en"``. ``"en"`` is also returned
        when none of the variables is set.
    """
    for var in ("LC_ALL", "LC_MESSAGES", "LANG"):
        val = os.environ.get(var, "").strip().lower()
        if not val:
            # Unset or empty: fall through to the next variable.
            continue
        # Highest-precedence non-empty variable wins, whatever its language.
        return "zh" if val.startswith("zh") else "en"
    return "en"
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
def t(key: str, **kwargs: object) -> str:
    """Translate a message key into the active locale.

    Args:
        key: Message identifier looked up in ``_MESSAGES``.
        **kwargs: Values for the template's named ``{placeholder}`` fields.
            When no kwargs are given the template is returned unformatted.

    Returns:
        The localized message. Unknown keys return the key itself so the gap
        is visible in the output. A key with no entry for the active locale
        falls back to its English text. If the template cannot be formatted
        with the supplied kwargs, the raw template is returned.
    """
    bundle = _MESSAGES.get(key)
    if bundle is None:
        return key
    template = bundle.get(_current_locale, bundle.get("en", key))
    if not kwargs:
        return template
    try:
        return template.format(**kwargs)
    except (KeyError, IndexError, ValueError, TypeError):
        # Best effort: a missing placeholder (KeyError/IndexError) or a value
        # that does not fit the format spec -- e.g. None for `{x:.2f}`
        # (TypeError) or a str for `{x:.0%}` (ValueError) -- must not crash
        # rendering; show the raw template instead.
        return template
|
src/llm_cal/common/yaml_loader.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic-validated YAML loader.
|
| 2 |
+
|
| 3 |
+
Shared between engine_compat and hardware modules. Supports `lazy=True` param
|
| 4 |
+
(v0.1 does not implement lazy — signature reserved for v0.2 when matrix > 100).
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import TypeVar
|
| 11 |
+
|
| 12 |
+
import yaml
|
| 13 |
+
from pydantic import BaseModel, ValidationError
|
| 14 |
+
|
| 15 |
+
T = TypeVar("T", bound=BaseModel)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class YamlLoadError(Exception):
    """Raised when a YAML file cannot be read, parsed, or validated against its schema."""
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def load_yaml(path: str | Path, schema: type[T], *, lazy: bool = False) -> T:
    """Load + validate a YAML file against a Pydantic schema.

    Args:
        path: YAML file to load.
        schema: Pydantic model the YAML is expected to conform to.
        lazy: Reserved for v0.2 (on-demand loading of large matrices). v0.1
            ignores this; document-scale data is small enough that eager
            loading is fine.

    Returns:
        An instance of ``schema`` built from the file's contents.

    Raises:
        YamlLoadError: If the file is missing, cannot be read or decoded as
            UTF-8, is not valid YAML, is empty, or fails schema validation.
            The underlying exception is chained as ``__cause__`` where one
            exists.
    """
    _ = lazy  # v0.1 behavior is always eager
    p = Path(path)
    try:
        # EAFP: open directly rather than exists()-then-open, so a file that
        # disappears in between, a directory, or a permission problem all
        # surface as YamlLoadError instead of a raw OSError.
        with p.open("r", encoding="utf-8") as f:
            raw = yaml.safe_load(f)
    except FileNotFoundError as e:
        raise YamlLoadError(f"YAML file not found: {p}") from e
    except yaml.YAMLError as e:
        raise YamlLoadError(f"YAML parse error in {p}: {e}") from e
    except (OSError, UnicodeDecodeError) as e:
        raise YamlLoadError(f"Could not read YAML file {p}: {e}") from e

    # safe_load returns None for an empty document; reject it explicitly so
    # the caller gets a clear message instead of a schema error.
    if raw is None:
        raise YamlLoadError(f"YAML file {p} is empty")

    try:
        return schema.model_validate(raw)
    except ValidationError as e:
        raise YamlLoadError(f"Schema validation failed for {p}:\n{e}") from e
|
src/llm_cal/core/__init__.py
ADDED
|
File without changes
|
src/llm_cal/core/cache.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Disk cache for model-source responses.
|
| 2 |
+
|
| 3 |
+
Key design decisions (from /plan-eng-review Issue #2 + Issue #10 critical):
|
| 4 |
+
|
| 5 |
+
- Key = (source, model_id, commit_sha). Commit sha is included so a repo update
|
| 6 |
+
invalidates cache automatically — prevents the critical regression of serving
|
| 7 |
+
stale data after the upstream model updates.
|
| 8 |
+
- TTL = 7 days default. Even without a commit change, we force re-fetch weekly.
|
| 9 |
+
- `--refresh` flag sets `bypass=True` on `get()` — caller drives it.
|
| 10 |
+
- Store location: platformdirs user cache dir, subdirectory `llm-cal`.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
from dataclasses import asdict, dataclass, is_dataclass
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from typing import Any
|
| 18 |
+
|
| 19 |
+
import diskcache
|
| 20 |
+
from platformdirs import user_cache_dir
|
| 21 |
+
|
| 22 |
+
from llm_cal.model_source.base import ModelArtifact, SiblingFile
|
| 23 |
+
|
| 24 |
+
_DEFAULT_TTL_SECONDS = 7 * 24 * 60 * 60 # 7 days
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@dataclass(frozen=True)
class CacheKey:
    """Immutable identity of one cached artifact: source, model id and revision."""

    source: str
    model_id: str
    commit_sha: str | None

    def to_string(self) -> str:
        """Render the key as ``source::model_id::revision``.

        A missing (or empty) ``commit_sha`` is rendered as ``HEAD``.
        """
        revision = self.commit_sha if self.commit_sha else "HEAD"
        return "{}::{}::{}".format(self.source, self.model_id, revision)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class ArtifactCache:
    """Persistent cache for ModelArtifact instances."""

    def __init__(
        self, cache_dir: str | Path | None = None, ttl_seconds: int = _DEFAULT_TTL_SECONDS
    ) -> None:
        """Open (creating if needed) the on-disk cache.

        Args:
            cache_dir: Directory holding the cache. Defaults to the
                platformdirs user cache dir for ``llm-cal``.
            ttl_seconds: Expiry passed to the store for every entry written
                by `set()`.
        """
        if cache_dir is None:
            cache_dir = user_cache_dir("llm-cal", appauthor=False)
        Path(cache_dir).mkdir(parents=True, exist_ok=True)
        self._cache = diskcache.Cache(str(cache_dir))
        self._ttl = ttl_seconds

    def get(self, key: CacheKey, bypass: bool = False) -> ModelArtifact | None:
        """Look up an artifact. `bypass=True` always returns None (used by --refresh).

        If `key.commit_sha` is None (no revision pinning), we never serve from cache
        because we can't prove freshness.

        An entry that can no longer be turned back into a ModelArtifact is
        treated as a miss and removed, so the caller simply re-fetches.
        """
        if bypass or key.commit_sha is None:
            return None
        cache_key = key.to_string()
        raw = self._cache.get(cache_key)
        if raw is None:
            return None
        try:
            return _deserialize_artifact(raw)
        except (KeyError, TypeError):
            # The stored payload does not match what _deserialize_artifact
            # expects (missing key, unexpected sibling field, wrong type).
            # Evict it instead of crashing; a cache must degrade to a miss.
            self._cache.delete(cache_key)
            return None

    def set(self, key: CacheKey, artifact: ModelArtifact) -> None:
        """Cache an artifact. No-op if commit_sha is None (can't guarantee freshness)."""
        if key.commit_sha is None:
            return
        self._cache.set(key.to_string(), _serialize_artifact(artifact), expire=self._ttl)

    def invalidate(self, key: CacheKey) -> bool:
        """Explicit invalidation, returns True if something was removed."""
        return bool(self._cache.delete(key.to_string()))

    def clear(self) -> None:
        """Wipe the whole cache — for tests and `llm-cal cache clear` (future)."""
        self._cache.clear()

    def close(self) -> None:
        """Close the underlying disk cache handle."""
        self._cache.close()
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def _serialize_artifact(a: ModelArtifact) -> dict[str, Any]:
|
| 81 |
+
return {
|
| 82 |
+
"source": a.source,
|
| 83 |
+
"model_id": a.model_id,
|
| 84 |
+
"commit_sha": a.commit_sha,
|
| 85 |
+
"config": a.config,
|
| 86 |
+
"siblings": [asdict(s) if is_dataclass(s) else s for s in a.siblings],
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def _deserialize_artifact(raw: dict[str, Any]) -> ModelArtifact:
    """Rebuild a ModelArtifact from its cached dict form.

    Each entry of ``raw["siblings"]`` is expanded as keyword arguments into
    a SiblingFile; the siblings are handed over as a tuple.
    """
    source = raw["source"]
    model_id = raw["model_id"]
    commit_sha = raw["commit_sha"]
    config = raw["config"]
    sibling_files = tuple(SiblingFile(**entry) for entry in raw["siblings"])
    return ModelArtifact(
        source=source,
        model_id=model_id,
        commit_sha=commit_sha,
        config=config,
        siblings=sibling_files,
    )
|
src/llm_cal/core/evaluator.py
ADDED
|
@@ -0,0 +1,375 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluator — the single orchestration layer.
|
| 2 |
+
|
| 3 |
+
v0.1 partial implementation: composes model_source + detector + weight_analyzer
|
| 4 |
+
+ reconciler + kv_cache + engine_compat + hardware. Fleet planner and command
|
| 5 |
+
generator land in Week 5 remainder.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from dataclasses import dataclass, field
|
| 11 |
+
|
| 12 |
+
from llm_cal.architecture.detector import detect
|
| 13 |
+
from llm_cal.architecture.formulas.kv_cache import compute_kv_cache_bytes
|
| 14 |
+
from llm_cal.architecture.formulas.weight import estimate_total_params
|
| 15 |
+
from llm_cal.architecture.profile import ArchitectureProfile
|
| 16 |
+
from llm_cal.command_generator.sglang import generate_sglang_command
|
| 17 |
+
from llm_cal.command_generator.vllm import generate_vllm_command
|
| 18 |
+
from llm_cal.core.cache import ArtifactCache, CacheKey
|
| 19 |
+
from llm_cal.engine_compat.loader import EngineCompatEntry, find_match
|
| 20 |
+
from llm_cal.fleet.planner import FleetRecommendation, plan
|
| 21 |
+
from llm_cal.hardware.loader import GPUSpec, UnknownGPUError, lookup
|
| 22 |
+
from llm_cal.model_source.base import ModelArtifact, ModelSource
|
| 23 |
+
from llm_cal.model_source.huggingface import HuggingFaceSource
|
| 24 |
+
from llm_cal.output.labels import AnnotatedValue
|
| 25 |
+
from llm_cal.performance.compute import (
|
| 26 |
+
DEFAULT_DECODE_BW_UTILIZATION,
|
| 27 |
+
DEFAULT_PREFILL_UTILIZATION,
|
| 28 |
+
DecodeEstimate,
|
| 29 |
+
PrefillEstimate,
|
| 30 |
+
estimate_decode,
|
| 31 |
+
estimate_prefill,
|
| 32 |
+
)
|
| 33 |
+
from llm_cal.performance.concurrency import ConcurrencyAnalysis
|
| 34 |
+
from llm_cal.performance.concurrency import analyze as analyze_concurrency
|
| 35 |
+
from llm_cal.weight_analyzer import WeightReport, analyze
|
| 36 |
+
from llm_cal.weight_analyzer.fingerprint import (
|
| 37 |
+
QuantFingerprint,
|
| 38 |
+
from_config,
|
| 39 |
+
from_safetensors_dtypes,
|
| 40 |
+
)
|
| 41 |
+
from llm_cal.weight_analyzer.reconciler import ReconciliationReport, reconcile
|
| 42 |
+
from llm_cal.weight_analyzer.safetensors_reader import (
|
| 43 |
+
fetch_tensor_dtypes,
|
| 44 |
+
pick_sample_shard,
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
_KV_REFERENCE_CTX = 131_072 # matches fleet.planner's _REFERENCE_CTX_TOKENS
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
@dataclass(frozen=True)
class EvaluationReport:
    """Everything the evaluator produces for one model.

    Frozen dataclass. Fields up to and including `reconciliation` are
    required; every field after it has a default (empty dict or None).
    """

    # --- What was evaluated ---
    model_id: str
    source: str
    commit_sha: str | None
    # --- Target hardware / engine as requested ---
    gpu: str
    gpu_spec: GPUSpec | None
    gpu_error: str | None  # message if gpu wasn't found
    engine: str
    # --- Analysis results ---
    profile: ArchitectureProfile
    weight: WeightReport
    total_params_estimate: AnnotatedValue[int]
    reconciliation: ReconciliationReport
    # Keyed by int — presumably the context length in tokens; confirm against the producer.
    kv_cache_by_context: dict[int, AnnotatedValue[int]] = field(default_factory=dict)
    engine_match: EngineCompatEntry | None = None
    fleet: FleetRecommendation | None = None
    generated_command: str | None = None
    # Performance analysis — filled when user passes SLA args (or defaults).
    prefill: PrefillEstimate | None = None
    decode: DecodeEstimate | None = None
    concurrency: ConcurrencyAnalysis | None = None
    perf_input_tokens: int | None = None
    perf_output_tokens: int | None = None
    perf_target_tokens_per_sec: float | None = None
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class Evaluator:
    """Run the full evaluation pipeline for one model / GPU / engine triple.

    Pipeline: model source fetch -> architecture detection -> weight analysis
    -> quantization reconciliation -> KV cache sizing -> engine compatibility
    -> hardware lookup -> fleet planning and launch command -> prefill /
    decode / concurrency estimates.
    """

    def __init__(
        self,
        source: ModelSource | None = None,
        cache: ArtifactCache | None = None,
    ) -> None:
        """Create an evaluator.

        Args:
            source: Where model metadata is fetched from. A fresh
                ``HuggingFaceSource`` is used when omitted.
            cache: Artifact cache. A fresh ``ArtifactCache`` is used when
                omitted.
        """
        self._source = source or HuggingFaceSource()
        self._cache = cache or ArtifactCache()

    def evaluate(
        self,
        model_id: str,
        gpu: str,
        engine: str,
        gpu_count: int | None = None,
        context_length: int | None = None,
        refresh: bool = False,
        input_tokens: int | None = None,
        output_tokens: int | None = None,
        target_tokens_per_sec: float | None = None,
        prefill_utilization: float = DEFAULT_PREFILL_UTILIZATION,
        decode_bw_utilization: float = DEFAULT_DECODE_BW_UTILIZATION,
        concurrency_degradation: float = 1.0,
    ) -> EvaluationReport:
        """Evaluate ``model_id`` on ``gpu`` with ``engine`` and build a report.

        Args:
            model_id: Model identifier understood by the configured source.
            gpu: GPU name to look up. An unknown GPU does not abort the run;
                the error text is stored in ``gpu_error`` instead.
            engine: Inference engine name; ``"sglang"`` (case-insensitive)
                selects the SGLang command generator, anything else vLLM.
            gpu_count: Forced GPU count. When falsy, the fleet planner's
                best tier decides.
            context_length: When given, KV cache is reported only for this
                context length, and it is passed as ``max_model_len`` to the
                command generator.
            refresh: Bypass the artifact cache lookup.
            input_tokens: Prompt length for the prefill estimate
                (defaults to 2000).
            output_tokens: Only recorded in the report (defaults to 512).
            target_tokens_per_sec: Per-request decode target for the
                concurrency analysis (defaults to 30.0).
            prefill_utilization: Forwarded to ``estimate_prefill``.
            decode_bw_utilization: Forwarded to ``estimate_decode``.
            concurrency_degradation: Forwarded to ``analyze_concurrency``.

        Returns:
            A populated ``EvaluationReport``. Fleet, command and performance
            fields stay ``None`` when their prerequisites are missing.
        """
        artifact = self._fetch(model_id, refresh=refresh)
        profile = detect(artifact.config)

        total_params_est = estimate_total_params(profile)
        total_params = total_params_est.value

        # Only safetensors shards count toward the observed weight size used
        # to sanity-check a declared quantization scheme.
        observed_bytes_for_fp = sum(
            (s.size or 0) for s in artifact.siblings if s.filename.endswith(".safetensors")
        )
        fingerprint = self._resolve_quant_fingerprint(
            artifact,
            observed_bytes=observed_bytes_for_fp,
            total_params=total_params if total_params > 0 else 0,
        )
        weight = analyze(
            artifact.siblings,
            total_params=total_params if total_params > 0 else None,
            fingerprint=fingerprint,
        )
        reconciliation = reconcile(
            weight.total_bytes.value,
            total_params or 1,  # avoid passing 0 params to the reconciler
            fingerprint=fingerprint,
        )

        contexts_to_report = self._select_context_lengths(profile, context_length)
        kv_by_ctx = {
            ctx: compute_kv_cache_bytes(profile, seq_len=ctx, dtype_bytes=2)
            for ctx in contexts_to_report
        }

        # Engine compatibility — match by model_type alone (v0.1). Version
        # filtering can be added via a future --engine-version flag.
        engine_match = find_match(engine=engine, model_type=profile.model_type)

        # Hardware lookup — never raises out to CLI, we embed the error message
        # so the user sees a partial report instead of aborting.
        gpu_spec: GPUSpec | None = None
        gpu_error: str | None = None
        try:
            gpu_spec = lookup(gpu)
        except UnknownGPUError as e:
            gpu_error = str(e)

        # Fleet planning — only if we have a known GPU. The planner's reference
        # context is 128K; derive KV bytes there (computing fresh in case the
        # user chose a non-overlapping context_length override).
        fleet: FleetRecommendation | None = None
        generated_command: str | None = None
        chosen_count: int | None = None
        if gpu_spec is not None and weight.total_bytes.value > 0:
            kv_ref = compute_kv_cache_bytes(profile, _KV_REFERENCE_CTX, dtype_bytes=2)
            kv_by_context_bytes = {ctx: av.value for ctx, av in kv_by_ctx.items() if av.value > 0}
            fleet = plan(
                profile=profile,
                weight_bytes=weight.total_bytes.value,
                kv_bytes_per_request_at_ref=max(1, kv_ref.value),
                gpu=gpu_spec,
                forced_gpu_count=gpu_count,
                kv_bytes_by_context=kv_by_context_bytes,
            )
            # One GPU count drives both the launch command and the
            # performance analysis below.
            chosen_count = self._chosen_gpu_count(fleet, gpu_count)
            generated_command = _generate_command(
                engine=engine,
                model_id=model_id,
                profile=profile,
                tp=chosen_count,
                entry=engine_match,
                max_model_len=context_length,
            )

        # Performance analysis — runs whenever we have hardware + fleet.
        prefill_est: PrefillEstimate | None = None
        decode_est: DecodeEstimate | None = None
        concurrency_est: ConcurrencyAnalysis | None = None
        if gpu_spec is not None and fleet is not None and total_params > 0:
            chosen = chosen_count
            # Resolve performance defaults when user didn't specify.
            eff_input = input_tokens or 2000
            eff_target = target_tokens_per_sec or 30.0

            prefill_est = estimate_prefill(
                profile=profile,
                total_params=total_params,
                gpu=gpu_spec,
                num_gpus=chosen,
                input_tokens=eff_input,
                utilization=prefill_utilization,
            )
            # MoE active ratio: active/total = (shared + experts_per_tok) / (shared + routed)
            moe_active_ratio: float | None = None
            if profile.moe is not None:
                active_experts = profile.moe.num_experts_per_tok + profile.moe.num_shared_experts
                total_experts = profile.moe.num_routed_experts + profile.moe.num_shared_experts
                if total_experts > 0:
                    moe_active_ratio = active_experts / total_experts
            decode_est = estimate_decode(
                profile=profile,
                total_weight_bytes=weight.total_bytes.value,
                gpu=gpu_spec,
                num_gpus=chosen,
                bw_utilization=decode_bw_utilization,
                moe_active_params_ratio=moe_active_ratio,
            )
            # Per-GPU headroom at the chosen tier; fall back to the last
            # option when no option matches the chosen count.
            chosen_option = next(
                (o for o in fleet.options if o.gpu_count == chosen),
                fleet.options[-1],
            )
            headroom_per_gpu = (
                chosen_option.usable_bytes_per_gpu - chosen_option.weight_bytes_per_gpu
            )
            # Reference context for the KV bound: 128K when it was surveyed,
            # otherwise the longest surveyed context. kv_by_ctx is never
            # empty because _select_context_lengths always returns at least
            # one context length.
            kv_ref_ctx = _KV_REFERENCE_CTX if _KV_REFERENCE_CTX in kv_by_ctx else max(kv_by_ctx)
            kv_ref_bytes: int = kv_by_ctx[kv_ref_ctx].value
            # Apply TP-aware sharding (same rule fleet planner uses).
            from llm_cal.fleet.planner import _kv_shards

            shards = _kv_shards(profile, chosen)
            kv_ref_per_gpu = max(1, kv_ref_bytes // shards)
            # Each request consumes its per-GPU KV share on every rank at the
            # same time, so "how many requests fit" is *per-GPU* headroom
            # divided by *per-GPU* KV. Negative headroom is clamped to zero.
            headroom_per_req_view = max(0, headroom_per_gpu)
            concurrency_est = analyze_concurrency(
                cluster_headroom_bytes=headroom_per_req_view,
                kv_bytes_per_request=kv_ref_per_gpu,
                decode=decode_est,
                target_tokens_per_sec=eff_target,
                degradation=concurrency_degradation,
            )

        return EvaluationReport(
            model_id=model_id,
            source=artifact.source,
            commit_sha=artifact.commit_sha,
            gpu=gpu,
            gpu_spec=gpu_spec,
            gpu_error=gpu_error,
            engine=engine,
            profile=profile,
            weight=weight,
            total_params_estimate=total_params_est,
            reconciliation=reconciliation,
            kv_cache_by_context=kv_by_ctx,
            engine_match=engine_match,
            fleet=fleet,
            generated_command=generated_command,
            prefill=prefill_est,
            decode=decode_est,
            concurrency=concurrency_est,
            perf_input_tokens=(input_tokens or 2000) if fleet else None,
            perf_output_tokens=(output_tokens or 512) if fleet else None,
            perf_target_tokens_per_sec=(target_tokens_per_sec or 30.0) if fleet else None,
        )

    @staticmethod
    def _chosen_gpu_count(fleet: FleetRecommendation, forced: int | None) -> int:
        """Return the GPU count to act on: ``forced`` if truthy, else the best tier's.

        Falls back to the first fleet option when no option carries the
        recommendation's ``best_tier``.
        """
        if forced:
            return forced
        return next(
            (o.gpu_count for o in fleet.options if o.tier == fleet.best_tier),
            fleet.options[0].gpu_count,
        )

    def _fetch(self, model_id: str, refresh: bool) -> ModelArtifact:
        """Fetch the model artifact, preferring a cached copy for the same commit.

        The source is always queried first because the cache key includes the
        commit SHA reported by that fetch. ``refresh=True`` bypasses the cache
        lookup; the fresh artifact is then stored.
        """
        artifact = self._source.fetch(model_id)
        key = CacheKey(
            source=self._source.name,
            model_id=model_id,
            commit_sha=artifact.commit_sha,
        )
        cached = self._cache.get(key, bypass=refresh)
        if cached is not None:
            return cached
        self._cache.set(key, artifact)
        return artifact

    def _resolve_quant_fingerprint(
        self,
        artifact: ModelArtifact,
        observed_bytes: int,
        total_params: int,
    ) -> QuantFingerprint | None:
        """Resolve the quantization scheme via authoritative evidence.

        Priority:
        1. config.json `quantization_config` — explicit author declaration.
           Free, no extra network call. But if its predicted bytes are
           wildly off (>15% from observed), fall through — config.json
           can be incomplete or stale (DeepSeek-V4-Flash declares
           `quant_method=fp8` but ships an FP4+FP8 mixed pack; trusting
           the declaration produces a 45% wrong answer).
        2. safetensors file header — per-tensor dtype fingerprint, read from
           one sample shard.

        Returns the config-derived fingerprint (possibly ``None``) when no
        sample shard exists, no dtypes could be read, or the header yields no
        fingerprint. The reconciler falls back to bytes-only argmin when the
        result is ``None`` (v0.1.1 behavior).
        """
        fp = from_config(artifact.config)
        if fp is not None and self._fingerprint_matches_bytes(fp, observed_bytes, total_params):
            return fp

        shard = pick_sample_shard(artifact.siblings)
        if shard is None:
            return fp  # safetensors unavailable — best we can do is the config hint

        dtypes = fetch_tensor_dtypes(
            source=artifact.source,
            model_id=artifact.model_id,
            revision=artifact.commit_sha or "main",
            shard_filename=shard.filename,
        )
        if not dtypes:
            return fp

        st_fp = from_safetensors_dtypes(dtypes)
        # Header is ground truth — prefer it over config when both exist.
        return st_fp if st_fp is not None else fp

    @staticmethod
    def _fingerprint_matches_bytes(
        fp: QuantFingerprint, observed_bytes: int, total_params: int
    ) -> bool:
        """Sanity-check a fingerprint's predicted bytes against observed.

        Returns True if the declared scheme's predicted bytes are within 15%
        of observed. False means config.json is either lying or describes
        only part of the model — we should consult safetensors instead.
        Also returns True when the check cannot be performed (unknown scheme,
        or non-positive params / observed bytes).
        """
        from llm_cal.weight_analyzer import _QUANT_BPP

        bpp = _QUANT_BPP.get(fp.scheme, 0.0)
        if bpp <= 0 or total_params <= 0 or observed_bytes <= 0:
            return True  # can't verify — don't penalize the fingerprint
        predicted = bpp * total_params
        rel_err = abs(observed_bytes - predicted) / predicted
        return rel_err <= 0.15

    @staticmethod
    def _select_context_lengths(profile: ArchitectureProfile, override: int | None) -> list[int]:
        """Pick the context lengths to report KV cache for.

        An explicit ``override`` wins. Otherwise the standard 4K / 32K / 128K
        set is used, extended by the model's maximum when that exceeds 128K,
        and trimmed to lengths the model supports. The result is never empty:
        a model whose maximum is below 4K is reported at its own maximum.
        """
        if override is not None:
            return [override]
        candidates = [4_096, 32_768, 131_072]
        max_pos = profile.position.max_position_embeddings if profile.position else None
        if not max_pos:
            return candidates
        if max_pos > 131_072:
            candidates.append(max_pos)
        supported = [c for c in candidates if c <= max_pos]
        # Short-context models (max < 4096) would otherwise get an empty
        # list, leaving the KV table empty and breaking callers that take
        # max() over it.
        return supported or [max_pos]
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
def _generate_command(
    engine: str,
    model_id: str,
    profile: ArchitectureProfile,
    tp: int,
    entry: EngineCompatEntry | None,
    max_model_len: int | None,
) -> str:
    """Build the launch command for the requested engine.

    The engine name is compared case-insensitively with surrounding
    whitespace ignored. ``"sglang"`` selects the SGLang generator; every
    other value falls back to the vLLM generator.
    """
    wants_sglang = engine.lower().strip() == "sglang"
    builder = generate_sglang_command if wants_sglang else generate_vllm_command
    return builder(model_id, profile, tp, entry, max_model_len=max_model_len)
|
src/llm_cal/core/explain.py
ADDED
|
@@ -0,0 +1,504 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Full derivation traces for each non-trivial number in the report.
|
| 2 |
+
|
| 3 |
+
This module is only invoked when the user passes `--explain`. It doesn't
|
| 4 |
+
recompute anything — it reads the values that the main evaluator already
|
| 5 |
+
produced and wraps them in a formatted explanation with formula, inputs,
|
| 6 |
+
step-by-step computation, and primary source citation.
|
| 7 |
+
|
| 8 |
+
Design rationale: the tool's core promise is deterministic, auditable
|
| 9 |
+
output. `--explain` makes that auditability human-readable. A user can:
|
| 10 |
+
1. Read the explanation themselves
|
| 11 |
+
2. Paste it into an LLM and ask "does this math check out?"
|
| 12 |
+
3. Cross-reference docs/methodology.md for the primary source
|
| 13 |
+
All three preserve determinism — the LLM is the user's tool, not ours.
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
import math
|
| 19 |
+
from dataclasses import dataclass, field
|
| 20 |
+
|
| 21 |
+
from llm_cal.core.evaluator import EvaluationReport
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@dataclass(frozen=True)
class ExplainInput:
    """A single named input to a formula, ready for display.

    Attributes:
        name: Variable name as it appears in the formula.
        value: Pre-formatted value for display.
        label: Provenance tag, e.g. "[verified]", "[estimated]".
        note: Optional disambiguation text.
    """

    name: str
    value: str
    label: str
    note: str = ""
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
@dataclass(frozen=True)
class ExplainEntry:
    """Complete derivation trace for one number shown in the report.

    Attributes:
        heading: Localized section title, e.g. "KV cache @ 128K".
        formula: The formula, literally.
        inputs: Input variables the formula consumes.
        steps: Step-by-step computation lines.
        result: Final formatted answer with label.
        source: Primary source citation.
        methodology_anchor: Anchor in docs/methodology.md,
            e.g. "#prefill-latency".
    """

    heading: str
    formula: str
    inputs: list[ExplainInput] = field(default_factory=list)
    steps: list[str] = field(default_factory=list)
    result: str = ""
    source: str = ""
    methodology_anchor: str = ""
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def build(report: EvaluationReport) -> list[ExplainEntry]:
    """Collect every explanation entry, in the order of the main report.

    Each section builder appends zero or more entries to the shared list.
    """
    section_builders = (
        _weight_bytes,
        _quantization,
        _kv_cache_contexts,
        _fleet_tiers,
        _prefill,
        _decode,
        _concurrency,
    )
    collected: list[ExplainEntry] = []
    for add_section in section_builders:
        add_section(report, collected)
    return collected
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# ======================================================================
|
| 63 |
+
# Weight
|
| 64 |
+
# ======================================================================
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _weight_bytes(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
    """Append the explanation of the total weight size.

    The value is read from the report as-is and shown in bytes and in GB.
    """
    total = report.weight.total_bytes
    size = total.value

    api_input = ExplainInput(
        name="HF model_info API",
        value=f"source={report.source}, sha={report.commit_sha or 'HEAD'}",
        label="[verified]",
    )
    computation = [
        f"Raw value from API = {size:,} bytes",
        f"= {size / 1e9:.2f} GB",
    ]
    entry = ExplainEntry(
        heading="Weight bytes (safetensors file sum)",
        formula="sum(sibling.size for sibling in HF model_info(files_metadata=True).siblings if sibling.endswith('.safetensors'))",
        inputs=[api_input],
        steps=computation,
        result=f"{size:,} bytes [verified]",
        source=total.source or "HF siblings API",
        methodology_anchor="#weight-bytes",
    )
    entries.append(entry)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _quantization(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
    """Append the explanation of the quantization-scheme reconciliation.

    Nothing is appended when the reconciliation has no candidates. At most
    the first six candidates are listed.
    """
    recon = report.reconciliation
    if not recon.candidates:
        return

    # NOTE(review): the first candidate is presented as the winner, which
    # presumably relies on candidates being sorted by error — confirm.
    top = recon.candidates[0]
    table_rows = [
        f" {cand.scheme:<16} predicted={cand.predicted_bytes / 1e9:.2f} GB "
        f"error={cand.relative_error * 100:.1f}%"
        for cand in recon.candidates[:6]
    ]
    candidates_table = "\n".join(table_rows)

    formula_inputs = [
        ExplainInput(
            name="observed_bytes",
            value=f"{recon.observed_bytes:,}",
            label="[verified]",
        ),
        ExplainInput(
            name="total_params",
            value=f"{recon.total_params:,}",
            label="[estimated]",
            note="from architecture formula — see '#params-estimate' entry below",
        ),
    ]
    computation = [
        "For each known quantization scheme, predict total bytes = bpp × params:",
        candidates_table,
        f"Winner: {top.scheme} at {top.relative_error * 100:.1f}% error",
    ]
    entries.append(
        ExplainEntry(
            heading="Quantization scheme (reconciliation)",
            formula="best_match = argmin_scheme |observed_bytes - scheme.bpp × total_params|",
            inputs=formula_inputs,
            steps=computation,
            result=f"{recon.best.value} [{recon.best.label.value}]",
            source="Nearest-anchor match against known bytes-per-param values",
            methodology_anchor="#quantization-scheme",
        )
    )
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
# ======================================================================
|
| 131 |
+
# KV cache
|
| 132 |
+
# ======================================================================
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def _kv_cache_contexts(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
    """Append one KV-cache explanation per reported context length.

    Contexts whose KV value is zero are skipped, and nothing is appended when
    the profile has no attention description. The arithmetic is rebuilt here
    for display only (with a fixed 2-byte dtype); the final figure shown is
    the value already stored in the report.
    """
    arch = report.profile
    attention = arch.attention
    if attention is None:
        return

    uses_mla = attention.variant == "MLA"
    uses_csa_hca = attention.variant == "CSA_HCA"

    for seq_len, kv in report.kv_cache_by_context.items():
        if kv.value == 0:
            continue

        # Per-token, per-layer footprint depends on the attention variant.
        if uses_mla and attention.kv_lora_rank:
            bytes_per_token_layer = attention.kv_lora_rank * 2  # kv_lora_rank × dtype(2)
            formula_text = "per_tok_per_layer = kv_lora_rank × dtype_bytes (MLA: compressed latent KV)"
            input_rows = [
                ExplainInput("kv_lora_rank", str(attention.kv_lora_rank), "[verified]"),
            ]
        else:
            bytes_per_token_layer = 2 * attention.num_kv_heads * attention.head_dim * 2
            formula_text = "per_tok_per_layer = 2 × num_kv_heads × head_dim × dtype_bytes (standard attention)"
            input_rows = [
                ExplainInput("num_kv_heads", str(attention.num_kv_heads), "[verified]"),
                ExplainInput("head_dim", str(attention.head_dim), "[verified]"),
            ]
        # Inputs shared by both variants follow the variant-specific ones.
        input_rows += [
            ExplainInput("dtype_bytes", "2", "[verified]", note="BF16/FP16"),
            ExplainInput("seq_len", f"{seq_len:,}", "[verified]"),
            ExplainInput("num_layers", str(arch.num_hidden_layers), "[verified]"),
        ]

        uncompressed = bytes_per_token_layer * seq_len * arch.num_hidden_layers
        computation = [
            f"per_tok_per_layer = {bytes_per_token_layer:,} bytes",
            f"baseline = per_tok_per_layer × seq_len × num_layers = {uncompressed:,} bytes",
        ]

        if uses_csa_hca and attention.compress_ratios:
            layer_ratios = attention.compress_ratios
            # A ratio of 0 means "keep everything" and counts as fraction 1.
            keep_fractions = [1.0 if ratio == 0 else 1.0 / ratio for ratio in layer_ratios]
            mean_keep = sum(keep_fractions) / len(layer_ratios)
            input_rows.append(
                ExplainInput(
                    "compress_ratios",
                    f"len={len(layer_ratios)} (avg keep-fraction={mean_keep:.4f})",
                    "[verified]",
                )
            )
            formula_text += (
                "\napply_csa_hca: baseline × avg(1/r_i for r_i in compress_ratios, 0 = keep-all=1)"
            )
            computation.append(f"avg_keep_fraction = {mean_keep:.4f}")
            computation.append(f"result = baseline × avg_keep_fraction = {kv.value:,} bytes")
        else:
            computation.append(f"result = baseline = {kv.value:,} bytes")

        entries.append(
            ExplainEntry(
                heading=f"KV cache @ {_fmt_ctx(seq_len)} context",
                formula=formula_text,
                inputs=input_rows,
                steps=computation,
                result=f"{kv.value:,} bytes = {kv.value / 1e9:.2f} GB [{kv.label.value}]",
                source=(
                    "DeepSeek-V2 paper (MLA); DeepSeek-V4 tech report (CSA+HCA); "
                    "standard attention formula per Attention Is All You Need (Vaswani 2017)"
                ),
                methodology_anchor="#kv-cache-per-request",
            )
        )
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
# ======================================================================
|
| 213 |
+
# Fleet tiers
|
| 214 |
+
# ======================================================================
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def _fleet_tiers(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
    """Append one explanation per fleet tier (min / dev / prod).

    Nothing is appended when the report has no fleet recommendation or no
    GPU spec. Values are read from the fleet options already in the report;
    only the per-GPU headroom (usable minus weight) is derived here.
    """
    fleet = report.fleet
    gpu_spec = report.gpu_spec
    if fleet is None or gpu_spec is None:
        return

    # Loop-invariant values shared by every tier block.
    total_weight = report.weight.total_bytes.value
    valid_tp = list(fleet.valid_tp_sizes)
    # Concurrent requests each tier is shown as needing room for; unknown
    # tiers fall back to 1.
    concurrency_by_tier = {"min": 1, "dev": 8, "prod": 16}

    for opt in fleet.options:
        tier_label = opt.tier
        headroom = opt.usable_bytes_per_gpu - opt.weight_bytes_per_gpu
        fit_criterion = concurrency_by_tier.get(tier_label, 1)
        steps = [
            f"per-GPU HBM usable (@ 90% util) = {opt.usable_bytes_per_gpu:,} bytes",
            f"weight per GPU = total_weight / TP_size = "
            f"{total_weight:,} / {opt.gpu_count} = "
            f"{opt.weight_bytes_per_gpu:,} bytes",
            f"headroom per GPU = usable - weight = {headroom:,} bytes ({headroom / 1e9:.2f} GB)",
            # Headroom already excludes the weights, so the criterion compares
            # it against KV alone — equivalent to the formula
            # weight_per_gpu + concurrent × kv_per_request ≤ usable_per_gpu.
            f"tier criterion: headroom ≥ {fit_criterion} × kv_per_request_128K",
            f"smallest TP count in {valid_tp} that "
            f"satisfies the criterion: {opt.gpu_count}",
        ]
        if not opt.fits:
            steps.append(
                f"NOTE: does not fit the criterion — the chosen {opt.gpu_count} "
                "is the best available."
            )

        entries.append(
            ExplainEntry(
                heading=f"Fleet tier: {tier_label} ({opt.gpu_count} GPUs)",
                formula=(
                    "smallest TP in valid_set where "
                    "weight_per_gpu + concurrent × kv_per_request ≤ usable_per_gpu"
                ),
                inputs=[
                    ExplainInput(
                        "total_weight_bytes",
                        f"{total_weight:,}",
                        "[verified]",
                    ),
                    ExplainInput(
                        "valid_TP_sizes",
                        str(valid_tp),
                        "[estimated]",
                        note="divisors of num_attention_heads capped at 8 (single node)",
                    ),
                    ExplainInput(
                        "GPU memory_gb",
                        f"{gpu_spec.memory_gb} GB",
                        "[verified]",
                    ),
                ],
                steps=steps,
                result=f"{opt.gpu_count} GPUs, fit={opt.fits}",
                source="vLLM --gpu-memory-utilization 0.9 convention; TP divisibility required by vLLM/SGLang",
                methodology_anchor="#tp-aware-kv-sharding",
            )
        )
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
# ======================================================================
|
| 280 |
+
# Prefill
|
| 281 |
+
# ======================================================================
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
def _prefill(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
    """Append the explanation of the single-request prefill latency.

    Nothing is appended unless the report carries a prefill estimate, a GPU
    spec, a fleet recommendation and the input-token count.
    """
    estimate = report.prefill
    gpu_spec = report.gpu_spec
    fleet = report.fleet
    prompt_tokens = report.perf_input_tokens
    if estimate is None or gpu_spec is None or fleet is None or prompt_tokens is None:
        return

    # GPU count is re-derived from the best fleet tier, falling back to the
    # first option.
    # NOTE(review): the report does not record a user-forced GPU count, so
    # this may differ from the count the estimate was computed with — confirm.
    default_count = fleet.options[0].gpu_count
    gpu_total = next(
        (opt.gpu_count for opt in fleet.options if opt.tier == fleet.best_tier),
        default_count,
    )

    param_count = report.total_params_estimate.value
    peak_tflops = gpu_spec.fp16_tflops
    flops = estimate.total_flops.value
    effective_tflops = estimate.peak_effective_tflops.value
    latency_ms = estimate.latency_ms.value

    formula_inputs = [
        ExplainInput(
            "params",
            f"{param_count:,}",
            "[estimated]",
            note="from architecture formula (see weight.py)",
        ),
        ExplainInput("input_tokens", f"{prompt_tokens:,}", "[user-set]"),
        ExplainInput(
            "peak_fp16_TFLOPS",
            f"{peak_tflops}",
            "[verified]",
            note=f"from GPU database, {gpu_spec.id} spec",
        ),
        ExplainInput("num_gpus", f"{gpu_total}", "[estimated]"),
        ExplainInput(
            "utilization",
            f"{estimate.utilization:.2f}",
            "[user-set]",
            note="empirical MFU, default 0.40 — override with --prefill-util",
        ),
    ]
    computation = [
        f"FLOPs = 2 × {param_count:,} × "
        f"{prompt_tokens:,} = {flops:.3e}",
        f"effective_TFLOPS = {peak_tflops} × {gpu_total} × "
        f"{estimate.utilization:.2f} = {effective_tflops:.1f}",
        f"latency = {flops:.3e} / "
        f"({effective_tflops:.1f} × 1e12) × 1000 = "
        f"{latency_ms:.1f} ms",
    ]
    entries.append(
        ExplainEntry(
            heading="Prefill latency (single request)",
            formula=(
                "FLOPs = 2 × params × input_tokens\n"
                "effective_TFLOPS = peak_fp16_TFLOPS × num_gpus × utilization\n"
                "latency_ms = (FLOPs / (effective_TFLOPS × 1e12)) × 1000"
            ),
            inputs=formula_inputs,
            steps=computation,
            result=f"{latency_ms:.1f} ms [{estimate.latency_ms.label.value}]",
            source="Kaplan et al. 2020 'Scaling Laws for Neural Language Models' (arxiv.org/abs/2001.08361)",
            methodology_anchor="#prefill-latency",
        )
    )
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
# ======================================================================
|
| 345 |
+
# Decode
|
| 346 |
+
# ======================================================================
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
def _decode(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
    """Append the "Decode throughput (cluster)" derivation to ``entries``.

    Nothing is appended when the report has no decode estimate, no GPU spec,
    or no fleet plan (including a fleet plan with an empty option list),
    because there is then no GPU count to explain.

    Args:
        report: Evaluation report the decode figures are read from.
        entries: Explain trace; one entry is appended in place.
    """
    if report.decode is None or report.gpu_spec is None or report.fleet is None:
        return
    options = report.fleet.options
    if not options:
        # Without any option there is no GPU count; options[0] would raise.
        return

    d = report.decode
    # A bandwidth figure missing from the GPU spec is displayed as 0.
    bw = report.gpu_spec.memory_bandwidth_gbps or 0
    # GPU count of the best tier, falling back to the first listed option.
    chosen = next(
        (o.gpu_count for o in options if o.tier == report.fleet.best_tier),
        options[0].gpu_count,
    )
    weight_per_gpu = d.active_weight_bytes_per_gpu.value
    effective_bw_gbs = bw * d.bw_utilization
    # The per-GPU rate is undefined for a zero weight footprint; say so
    # instead of letting the explain trace die with ZeroDivisionError.
    if weight_per_gpu:
        per_gpu_rate = f"{effective_bw_gbs * 1e9 / weight_per_gpu:.1f} tok/s"
    else:
        per_gpu_rate = "n/a (weight_per_gpu is 0)"
    steps = [
        f"weight_per_gpu = {report.weight.total_bytes.value:,} / {chosen} = "
        f"{weight_per_gpu:,} bytes ({weight_per_gpu / 1e9:.2f} GB)",
        f"effective_bw = {bw} × {d.bw_utilization:.2f} = {effective_bw_gbs:.0f} GB/s",
        f"per_gpu_tok_per_sec = effective_bw / weight_per_gpu = {per_gpu_rate}",
        f"cluster_tok_per_sec = per_gpu × {chosen} × "
        f"{d.cluster_comm_efficiency:.2f} = {d.cluster_tokens_per_sec.value:.1f} tok/s",
    ]
    entries.append(
        ExplainEntry(
            heading="Decode throughput (cluster)",
            formula=(
                "per_gpu_tok_per_sec = memory_bandwidth × bw_util / weight_bytes_per_gpu\n"
                "cluster_tok_per_sec = per_gpu × num_gpus × cluster_comm_efficiency"
            ),
            inputs=[
                ExplainInput(
                    "GPU memory_bandwidth_gbps",
                    f"{bw}",
                    "[verified]",
                    note=f"from GPU database, {report.gpu_spec.id}",
                ),
                ExplainInput(
                    "bw_util",
                    f"{d.bw_utilization:.2f}",
                    "[user-set]",
                    note="empirical, default 0.50 — override with --decode-bw-util",
                ),
                ExplainInput("weight_bytes_per_gpu", f"{weight_per_gpu:,}", "[estimated]"),
                ExplainInput("num_gpus", f"{chosen}", "[estimated]"),
                ExplainInput(
                    "cluster_comm_efficiency",
                    f"{d.cluster_comm_efficiency:.2f}",
                    "[user-set]",
                    note="NCCL AllReduce efficiency on NVLink, default 0.90",
                ),
            ],
            steps=steps,
            result=f"{d.cluster_tokens_per_sec.value:.1f} tok/s [estimated]",
            source="vLLM paper (Kwon et al. SOSP 2023, arxiv.org/abs/2309.06180)",
            methodology_anchor="#decode-tokens-per-second",
        )
    )
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
# ======================================================================
|
| 407 |
+
# Concurrency bounds
|
| 408 |
+
# ======================================================================
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
def _concurrency(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
    """Append the K-bound, L-bound and max-concurrency derivations to ``entries``.

    Three entries are appended, in that order. Nothing is appended when the
    report carries no concurrency analysis.

    Args:
        report: Evaluation report the concurrency figures are read from.
        entries: Explain trace, extended in place.
    """
    conc = report.concurrency
    if conc is None:
        return

    # --- K: how many requests fit in the per-GPU memory headroom ----------
    headroom_txt = f"{conc.k_source_headroom_bytes:,}"
    kv_per_req_txt = f"{conc.k_source_kv_per_req_bytes:,}"
    k_entry = ExplainEntry(
        heading="K bound (memory capacity)",
        formula="K = floor(per_GPU_headroom_bytes / per_GPU_kv_bytes_per_request)",
        inputs=[
            ExplainInput("per_GPU_headroom_bytes", headroom_txt, "[estimated]"),
            ExplainInput(
                "per_GPU_kv_bytes_per_request",
                kv_per_req_txt,
                "[estimated]",
                note="post-TP-sharding via min(tp, num_kv_heads)",
            ),
        ],
        steps=[f"K = floor({headroom_txt} / {kv_per_req_txt}) = {conc.k_bound.value}"],
        result=f"K = {conc.k_bound.value} [{conc.k_bound.label.value}]",
        source="TP sharding rule from vLLM source code (verified)",
        methodology_anchor="#k-bound-memory-capacity",
    )
    entries.append(k_entry)

    # --- L: how many users the cluster throughput can serve at the SLA ----
    # Cluster throughput is shown as 0 when the report has no decode estimate.
    if report.decode:
        l_tps = report.decode.cluster_tokens_per_sec.value
    else:
        l_tps = 0
    tps_txt = f"{l_tps:.1f}"
    target_txt = f"{conc.target_tokens_per_sec:.1f}"
    degradation_txt = f"{conc.degradation_factor:.2f}"
    l_entry = ExplainEntry(
        heading="L bound (compute/bandwidth at SLA)",
        formula=(
            "L = floor(cluster_tok_per_sec / target_per_user_tok_per_sec / degradation_factor)"
        ),
        inputs=[
            ExplainInput("cluster_tok_per_sec", tps_txt, "[estimated]"),
            ExplainInput(
                "target_per_user_tok_per_sec",
                target_txt,
                "[user-set]",
                note="SLA, override with --target-tokens-per-sec",
            ),
            ExplainInput(
                "degradation_factor",
                degradation_txt,
                "[user-set]",
                note="default 1.0 = no degradation; override with --concurrency-degradation",
            ),
        ],
        steps=[
            f"L = floor({tps_txt} / {target_txt} / {degradation_txt}) = {conc.l_bound.value}",
        ],
        result=f"L = {conc.l_bound.value} [{conc.l_bound.label.value}]",
        source="Standard SLA-based capacity planning",
        methodology_anchor="#l-bound-compute-bandwidth-at-sla",
    )
    entries.append(l_entry)

    # --- Verdict: the smaller bound wins and names the bottleneck ---------
    verdict_entry = ExplainEntry(
        heading="Max concurrent + bottleneck verdict",
        formula=(
            "max_concurrent = min(K, L); bottleneck = 'memory_capacity' if K ≤ L else "
            "'memory_bandwidth / compute'"
        ),
        inputs=[
            ExplainInput("K", str(conc.k_bound.value), f"[{conc.k_bound.label.value}]"),
            ExplainInput("L", str(conc.l_bound.value), f"[{conc.l_bound.label.value}]"),
        ],
        steps=[
            f"max_concurrent = min(K={conc.k_bound.value}, L={conc.l_bound.value}) = "
            f"{conc.max_concurrent.value}",
            f"bottleneck = {conc.bottleneck}",
        ],
        result=f"{conc.max_concurrent.value} concurrent, bottleneck = {conc.bottleneck}",
        source=conc.bottleneck_reason_en,
        methodology_anchor="#concurrency-bounds-k-l",
    )
    entries.append(verdict_entry)

    # Keeps the module-level `math` import referenced so linters do not
    # report it as unused.
    _ = math.floor(0)
|
| 492 |
+
|
| 493 |
+
|
| 494 |
+
# ======================================================================
|
| 495 |
+
# Helpers
|
| 496 |
+
# ======================================================================
|
| 497 |
+
|
| 498 |
+
|
| 499 |
+
def _fmt_ctx(ctx: int) -> str:
|
| 500 |
+
if ctx >= 1_000_000:
|
| 501 |
+
return f"{ctx // 1_000_000}M"
|
| 502 |
+
if ctx >= 1024:
|
| 503 |
+
return f"{ctx // 1024}K"
|
| 504 |
+
return str(ctx)
|
src/llm_cal/engine_compat/__init__.py
ADDED
|
File without changes
|
src/llm_cal/engine_compat/loader.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Engine compatibility matrix loader + match function."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from functools import lru_cache
|
| 6 |
+
from importlib.resources import files
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Literal
|
| 9 |
+
|
| 10 |
+
from packaging.specifiers import InvalidSpecifier, SpecifierSet
|
| 11 |
+
from packaging.version import InvalidVersion, Version
|
| 12 |
+
from pydantic import BaseModel, Field
|
| 13 |
+
|
| 14 |
+
from llm_cal.common.yaml_loader import load_yaml
|
| 15 |
+
|
| 16 |
+
# Allowed values for the `support` field of a compatibility entry.
SupportLevel = Literal["full", "partial", "broken", "unverified"]
|
| 17 |
+
# Allowed values for the `verification_level` field of a compatibility entry.
VerificationLevel = Literal["verified", "cited", "unverified"]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class EngineFlag(BaseModel):
    """One engine command-line flag referenced by a compatibility entry."""

    # Flag name as written in the matrix, e.g. "--attention-backend".
    flag: str
    # Value passed with the flag; None when the matrix gives none.
    value: str | None = None
    # Optional explanatory note, English and Chinese variants.
    note_en: str | None = None
    note_zh: str | None = None
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class EngineSource(BaseModel):
    """A piece of evidence cited by a compatibility entry."""

    # Kind of evidence: release_notes | announcement | pr | tested
    type: str
    url: str | None = None
    captured_date: str | None = None
    # Optional free-text note, English and Chinese variants.
    note_en: str | None = None
    note_zh: str | None = None

    # Fields specific to `tested` sources (may be absent on other types).
    tester: str | None = None
    date: str | None = None
    hardware: str | None = None
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class EngineCompatEntry(BaseModel):
    """One row of the engine compatibility matrix.

    A row states how well one engine, over a range of engine versions,
    supports one model type, plus the flags, sources and caveats that go
    with that statement.
    """

    engine: Literal["vllm", "sglang"]
    # Version specifier string for the engine, e.g. ">=0.19.0".
    version_spec: str
    # Model type this row applies to; find_match compares it for equality.
    matches_model_type: str
    support: SupportLevel
    verification_level: VerificationLevel

    # List fields default to a fresh empty list per instance.
    required_flags: list[EngineFlag] = Field(default_factory=list)
    optional_flags: list[EngineFlag] = Field(default_factory=list)
    sources: list[EngineSource] = Field(default_factory=list)
    caveats_en: list[str] = Field(default_factory=list)
    caveats_zh: list[str] = Field(default_factory=list)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class EngineCompatMatrix(BaseModel):
    """Top-level document of the compatibility matrix file."""

    # Schema version number declared by the matrix file.
    schema_version: int
    # All compatibility rows, in the order they were loaded.
    entries: list[EngineCompatEntry]
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _default_path() -> Path:
    """Return the path of the ``matrix.yaml`` bundled with ``llm_cal.engine_compat``."""
    bundled = files("llm_cal.engine_compat").joinpath("matrix.yaml")
    # The resource handle is converted through str() to obtain a plain Path.
    return Path(str(bundled))
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
@lru_cache(maxsize=1)
def load_matrix(path: Path | None = None) -> EngineCompatMatrix:
    """Load and validate the engine compatibility matrix.

    Args:
        path: YAML file to read; the bundled ``matrix.yaml`` is used when
            this is omitted.

    Returns:
        The parsed matrix. ``lru_cache(maxsize=1)`` keeps only the result
        for the most recently used ``path`` argument.
    """
    source = path if path else _default_path()
    return load_yaml(source, EngineCompatMatrix)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def find_match(
    engine: str,
    model_type: str,
    version: str | None = None,
    matrix: EngineCompatMatrix | None = None,
) -> EngineCompatEntry | None:
    """Find the highest-version matching entry for (engine, model_type).

    Candidates are the entries whose engine and model type equal the
    lower-cased, stripped arguments. Among several suitable candidates the
    one with the highest lower version bound (see ``_lower_bound_key``) is
    returned.

    Args:
        engine: Engine name, e.g. ``"vllm"``; case and surrounding
            whitespace are ignored.
        model_type: Model type to look up; case and surrounding whitespace
            are ignored.
        version: Engine version to check. If None, or if it cannot be
            parsed as a version, every candidate is considered suitable.
            Otherwise only candidates whose ``version_spec`` covers it are.
        matrix: Matrix to search; the default matrix is loaded when omitted.

    Returns:
        The selected entry, or None when no entry exists for the pair or
        none of them covers ``version``.
    """
    m = matrix if matrix is not None else load_matrix()
    engine_norm = engine.lower().strip()
    model_type_norm = model_type.lower().strip()

    candidates = [
        e for e in m.entries if e.engine == engine_norm and e.matches_model_type == model_type_norm
    ]
    if not candidates:
        return None

    if version is None:
        # Return the entry with the "highest lower bound" as the most relevant.
        return max(candidates, key=_lower_bound_key)

    try:
        v = Version(version)
    except InvalidVersion:
        # An unparseable version cannot be filtered on; rank the candidates
        # exactly as if no version had been given rather than by file order.
        return max(candidates, key=_lower_bound_key)

    covering = []
    for entry in candidates:
        try:
            if v in SpecifierSet(entry.version_spec):
                covering.append(entry)
        except InvalidSpecifier:
            # A malformed spec in the matrix cannot cover any version.
            continue
    if not covering:
        return None
    # Several specs may cover the same version (e.g. ">=0.6" and ">=0.19");
    # prefer the most specific one instead of whichever is listed first.
    return max(covering, key=_lower_bound_key)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def _lower_bound_key(entry: EngineCompatEntry) -> Version:
    """Return the lower version bound of ``entry.version_spec`` (approximate, sort key only).

    Every clause that imposes a lower bound (``>=``, ``>``, ``==``, ``~=``)
    contributes its version, and the largest one is returned, because a
    specifier set is the intersection of its clauses. A trailing ``.*``
    wildcard (``==0.6.*``) is dropped before parsing. ``Version("0.0.0")``
    is returned when the spec is malformed or has no usable lower bound.
    """
    floor = Version("0.0.0")
    try:
        spec = SpecifierSet(entry.version_spec)
    except InvalidSpecifier:
        return floor

    bounds = []
    # The iteration order of a SpecifierSet is not relied upon: all clauses
    # are collected and the maximum is taken, so the key is deterministic.
    for single in spec:
        if single.operator not in (">=", "==", ">", "~="):
            continue
        text = single.version
        if text.endswith(".*"):
            # "0.6.*" is not a valid version on its own; its prefix is the bound.
            text = text[:-2]
        try:
            bounds.append(Version(text))
        except InvalidVersion:
            continue
    return max(bounds, default=floor)
|
src/llm_cal/engine_compat/matrix.yaml
ADDED
|
@@ -0,0 +1,512 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Engine compatibility matrix — v0.1 initial entries.
|
| 2 |
+
#
|
| 3 |
+
# VERIFICATION LEVELS:
|
| 4 |
+
# verified = actually tested by someone with hardware (requires type=tested source)
|
| 5 |
+
# >>> v0.1 has ZERO `verified` entries — author has no test hardware <<<
|
| 6 |
+
# cited = evidence exists (release note / PR / announcement) but not tested by us
|
| 7 |
+
# unverified = no sources, just an educated guess
|
| 8 |
+
#
|
| 9 |
+
# The tool ALWAYS surfaces verification_level in output. Users never see a green
|
| 10 |
+
# checkmark on an unverified row.
|
| 11 |
+
schema_version: 2
|
| 12 |
+
entries:
|
| 13 |
+
# ============================================================
|
| 14 |
+
# vLLM
|
| 15 |
+
# ============================================================
|
| 16 |
+
- engine: vllm
|
| 17 |
+
version_spec: ">=0.19.0"
|
| 18 |
+
matches_model_type: deepseek_v4
|
| 19 |
+
support: full
|
| 20 |
+
verification_level: cited
|
| 21 |
+
required_flags: []
|
| 22 |
+
optional_flags:
|
| 23 |
+
- flag: "--attention-backend"
|
| 24 |
+
value: "auto"
|
| 25 |
+
note_en: "Picks CSA+HCA backend automatically."
|
| 26 |
+
note_zh: "自动选择 CSA+HCA 注意力后端。"
|
| 27 |
+
sources:
|
| 28 |
+
- type: release_notes
|
| 29 |
+
url: "https://github.com/vllm-project/vllm/releases/tag/v0.19.0"
|
| 30 |
+
captured_date: "2026-04-23"
|
| 31 |
+
- type: announcement
|
| 32 |
+
url: "https://x.com/vllm_project/status/2047520252851105796"
|
| 33 |
+
captured_date: "2026-04-23"
|
| 34 |
+
note_en: "Day-0 support announcement."
|
| 35 |
+
note_zh: "Day-0 支持公告。"
|
| 36 |
+
caveats_en:
|
| 37 |
+
- "H800 MoE all-to-all is bottlenecked by halved NVLink; throughput lower than H100."
|
| 38 |
+
- "1M context requires --max-model-len 1048576 + --gpu-memory-utilization 0.9."
|
| 39 |
+
caveats_zh:
|
| 40 |
+
- "H800 的 MoE all-to-all 受限于减半的 NVLink,吞吐明显低于 H100。"
|
| 41 |
+
- "1M 上下文需要 --max-model-len 1048576 + --gpu-memory-utilization 0.9。"
|
| 42 |
+
|
| 43 |
+
- engine: vllm
|
| 44 |
+
version_spec: ">=0.18.0,<0.19.0"
|
| 45 |
+
matches_model_type: deepseek_v3_2
|
| 46 |
+
support: full
|
| 47 |
+
verification_level: cited
|
| 48 |
+
required_flags:
|
| 49 |
+
- flag: "--attention-backend"
|
| 50 |
+
value: "nsa"
|
| 51 |
+
optional_flags: []
|
| 52 |
+
sources:
|
| 53 |
+
- type: release_notes
|
| 54 |
+
url: "https://docs.vllm.ai/projects/recipes/en/latest/DeepSeek/DeepSeek-V3_2.html"
|
| 55 |
+
captured_date: "2026-04-24"
|
| 56 |
+
caveats_en:
|
| 57 |
+
- "TP=8 padding overhead: 128 attention heads / 8 = 16 per rank but padded to 64. Prefer TP=2 + DP/EP."
|
| 58 |
+
caveats_zh:
|
| 59 |
+
- "TP=8 存在 padding 开销:128 个头 / 8 = 16 头/rank,但填充到 64。建议 TP=2 + DP/EP。"
|
| 60 |
+
|
| 61 |
+
- engine: vllm
|
| 62 |
+
version_spec: ">=0.7.0"
|
| 63 |
+
matches_model_type: deepseek_v3
|
| 64 |
+
support: full
|
| 65 |
+
verification_level: cited
|
| 66 |
+
required_flags: []
|
| 67 |
+
optional_flags:
|
| 68 |
+
- flag: "--trust-remote-code"
|
| 69 |
+
value: null
|
| 70 |
+
note_en: "Required for custom DeepSeek modeling code."
|
| 71 |
+
note_zh: "用于加载 DeepSeek 的自定义建模代码。"
|
| 72 |
+
sources:
|
| 73 |
+
- type: release_notes
|
| 74 |
+
url: "https://docs.vllm.ai/projects/recipes/en/latest/DeepSeek/DeepSeek-V3.html"
|
| 75 |
+
captured_date: "2026-04-24"
|
| 76 |
+
caveats_en: []
|
| 77 |
+
caveats_zh: []
|
| 78 |
+
|
| 79 |
+
- engine: vllm
|
| 80 |
+
version_spec: ">=0.6.0"
|
| 81 |
+
matches_model_type: llama
|
| 82 |
+
support: full
|
| 83 |
+
verification_level: cited
|
| 84 |
+
required_flags: []
|
| 85 |
+
optional_flags: []
|
| 86 |
+
sources:
|
| 87 |
+
- type: release_notes
|
| 88 |
+
url: "https://docs.vllm.ai/"
|
| 89 |
+
captured_date: "2026-04-24"
|
| 90 |
+
caveats_en: []
|
| 91 |
+
caveats_zh: []
|
| 92 |
+
|
| 93 |
+
- engine: vllm
|
| 94 |
+
version_spec: ">=0.7.0"
|
| 95 |
+
matches_model_type: qwen3
|
| 96 |
+
support: full
|
| 97 |
+
verification_level: cited
|
| 98 |
+
required_flags: []
|
| 99 |
+
optional_flags: []
|
| 100 |
+
sources:
|
| 101 |
+
- type: release_notes
|
| 102 |
+
url: "https://docs.vllm.ai/"
|
| 103 |
+
captured_date: "2026-04-24"
|
| 104 |
+
caveats_en: []
|
| 105 |
+
caveats_zh: []
|
| 106 |
+
|
| 107 |
+
- engine: vllm
|
| 108 |
+
version_spec: ">=0.7.0"
|
| 109 |
+
matches_model_type: qwen3_moe
|
| 110 |
+
support: full
|
| 111 |
+
verification_level: cited
|
| 112 |
+
required_flags: []
|
| 113 |
+
optional_flags:
|
| 114 |
+
- flag: "--enable-expert-parallel"
|
| 115 |
+
value: null
|
| 116 |
+
note_en: "Enables DP+EP for MoE all-to-all distribution."
|
| 117 |
+
note_zh: "启用 DP+EP,对 MoE all-to-all 通信更友好。"
|
| 118 |
+
sources:
|
| 119 |
+
- type: release_notes
|
| 120 |
+
url: "https://docs.vllm.ai/en/latest/serving/expert_parallel_deployment/"
|
| 121 |
+
captured_date: "2026-04-24"
|
| 122 |
+
caveats_en: []
|
| 123 |
+
caveats_zh: []
|
| 124 |
+
|
| 125 |
+
- engine: vllm
|
| 126 |
+
version_spec: ">=0.6.0"
|
| 127 |
+
matches_model_type: mixtral
|
| 128 |
+
support: full
|
| 129 |
+
verification_level: cited
|
| 130 |
+
required_flags: []
|
| 131 |
+
optional_flags: []
|
| 132 |
+
sources:
|
| 133 |
+
- type: release_notes
|
| 134 |
+
url: "https://docs.vllm.ai/"
|
| 135 |
+
captured_date: "2026-04-24"
|
| 136 |
+
caveats_en: []
|
| 137 |
+
caveats_zh: []
|
| 138 |
+
|
| 139 |
+
- engine: vllm
|
| 140 |
+
version_spec: ">=0.6.0"
|
| 141 |
+
matches_model_type: mistral
|
| 142 |
+
support: full
|
| 143 |
+
verification_level: cited
|
| 144 |
+
required_flags: []
|
| 145 |
+
optional_flags: []
|
| 146 |
+
sources:
|
| 147 |
+
- type: release_notes
|
| 148 |
+
url: "https://docs.vllm.ai/"
|
| 149 |
+
captured_date: "2026-04-24"
|
| 150 |
+
caveats_en: []
|
| 151 |
+
caveats_zh: []
|
| 152 |
+
|
| 153 |
+
- engine: vllm
|
| 154 |
+
version_spec: ">=0.6.0"
|
| 155 |
+
matches_model_type: qwen2
|
| 156 |
+
support: full
|
| 157 |
+
verification_level: cited
|
| 158 |
+
required_flags: []
|
| 159 |
+
optional_flags: []
|
| 160 |
+
sources:
|
| 161 |
+
- type: release_notes
|
| 162 |
+
url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
|
| 163 |
+
captured_date: "2026-04-24"
|
| 164 |
+
caveats_en: []
|
| 165 |
+
caveats_zh: []
|
| 166 |
+
|
| 167 |
+
- engine: vllm
|
| 168 |
+
version_spec: ">=0.6.0"
|
| 169 |
+
matches_model_type: qwen2_moe
|
| 170 |
+
support: full
|
| 171 |
+
verification_level: cited
|
| 172 |
+
required_flags: []
|
| 173 |
+
optional_flags:
|
| 174 |
+
- flag: "--enable-expert-parallel"
|
| 175 |
+
value: null
|
| 176 |
+
note_en: "Enables DP+EP for MoE all-to-all distribution."
|
| 177 |
+
note_zh: "启用 DP+EP,对 MoE all-to-all 通信更友好。"
|
| 178 |
+
sources:
|
| 179 |
+
- type: release_notes
|
| 180 |
+
url: "https://docs.vllm.ai/en/latest/serving/expert_parallel_deployment/"
|
| 181 |
+
captured_date: "2026-04-24"
|
| 182 |
+
caveats_en: []
|
| 183 |
+
caveats_zh: []
|
| 184 |
+
|
| 185 |
+
- engine: vllm
|
| 186 |
+
version_spec: ">=0.5.0"
|
| 187 |
+
matches_model_type: gemma
|
| 188 |
+
support: full
|
| 189 |
+
verification_level: cited
|
| 190 |
+
required_flags: []
|
| 191 |
+
optional_flags: []
|
| 192 |
+
sources:
|
| 193 |
+
- type: release_notes
|
| 194 |
+
url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
|
| 195 |
+
captured_date: "2026-04-24"
|
| 196 |
+
caveats_en:
|
| 197 |
+
- "Gemma uses tied embeddings — output head shares embedding weights."
|
| 198 |
+
caveats_zh:
|
| 199 |
+
- "Gemma 使用权重绑定的 embedding(tied embeddings),输出头与 embedding 共享权重。"
|
| 200 |
+
|
| 201 |
+
- engine: vllm
|
| 202 |
+
version_spec: ">=0.6.0"
|
| 203 |
+
matches_model_type: gemma2
|
| 204 |
+
support: full
|
| 205 |
+
verification_level: cited
|
| 206 |
+
required_flags: []
|
| 207 |
+
optional_flags: []
|
| 208 |
+
sources:
|
| 209 |
+
- type: release_notes
|
| 210 |
+
url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
|
| 211 |
+
captured_date: "2026-04-24"
|
| 212 |
+
caveats_en: []
|
| 213 |
+
caveats_zh: []
|
| 214 |
+
|
| 215 |
+
- engine: vllm
|
| 216 |
+
version_spec: ">=0.7.0"
|
| 217 |
+
matches_model_type: gemma3
|
| 218 |
+
support: full
|
| 219 |
+
verification_level: cited
|
| 220 |
+
required_flags: []
|
| 221 |
+
optional_flags: []
|
| 222 |
+
sources:
|
| 223 |
+
- type: release_notes
|
| 224 |
+
url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
|
| 225 |
+
captured_date: "2026-04-24"
|
| 226 |
+
caveats_en:
|
| 227 |
+
- "Gemma 3 adds vision modality — v0.1 of llm-cal treats it as text-only for now."
|
| 228 |
+
caveats_zh:
|
| 229 |
+
- "Gemma 3 新增视觉多模态能力,llm-cal v0.1 当作纯文本模型处理。"
|
| 230 |
+
|
| 231 |
+
- engine: vllm
|
| 232 |
+
version_spec: ">=0.5.0"
|
| 233 |
+
matches_model_type: phi
|
| 234 |
+
support: full
|
| 235 |
+
verification_level: cited
|
| 236 |
+
required_flags: []
|
| 237 |
+
optional_flags: []
|
| 238 |
+
sources:
|
| 239 |
+
- type: release_notes
|
| 240 |
+
url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
|
| 241 |
+
captured_date: "2026-04-24"
|
| 242 |
+
caveats_en: []
|
| 243 |
+
caveats_zh: []
|
| 244 |
+
|
| 245 |
+
- engine: vllm
|
| 246 |
+
version_spec: ">=0.5.0"
|
| 247 |
+
matches_model_type: phi3
|
| 248 |
+
support: full
|
| 249 |
+
verification_level: cited
|
| 250 |
+
required_flags: []
|
| 251 |
+
optional_flags: []
|
| 252 |
+
sources:
|
| 253 |
+
- type: release_notes
|
| 254 |
+
url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
|
| 255 |
+
captured_date: "2026-04-24"
|
| 256 |
+
caveats_en: []
|
| 257 |
+
caveats_zh: []
|
| 258 |
+
|
| 259 |
+
- engine: vllm
|
| 260 |
+
version_spec: ">=0.6.0"
|
| 261 |
+
matches_model_type: deepseek_v2
|
| 262 |
+
support: full
|
| 263 |
+
verification_level: cited
|
| 264 |
+
required_flags: []
|
| 265 |
+
optional_flags:
|
| 266 |
+
- flag: "--trust-remote-code"
|
| 267 |
+
value: null
|
| 268 |
+
note_en: "Required for DeepSeek V2 custom modeling code."
|
| 269 |
+
note_zh: "加载 DeepSeek V2 的自定义建模代码。"
|
| 270 |
+
sources:
|
| 271 |
+
- type: release_notes
|
| 272 |
+
url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
|
| 273 |
+
captured_date: "2026-04-24"
|
| 274 |
+
caveats_en: []
|
| 275 |
+
caveats_zh: []
|
| 276 |
+
|
| 277 |
+
# ============================================================
|
| 278 |
+
# SGLang
|
| 279 |
+
# ============================================================
|
| 280 |
+
- engine: sglang
|
| 281 |
+
version_spec: ">=0.5.0"
|
| 282 |
+
matches_model_type: deepseek_v3_2
|
| 283 |
+
support: full
|
| 284 |
+
verification_level: cited
|
| 285 |
+
required_flags:
|
| 286 |
+
- flag: "--attention-backend"
|
| 287 |
+
value: "nsa"
|
| 288 |
+
optional_flags: []
|
| 289 |
+
sources:
|
| 290 |
+
- type: release_notes
|
| 291 |
+
url: "https://docs.sglang.io/advanced_features/attention_backend.html"
|
| 292 |
+
captured_date: "2026-04-24"
|
| 293 |
+
- type: announcement
|
| 294 |
+
url: "https://www.lmsys.org/blog/2025-09-29-deepseek-V32/"
|
| 295 |
+
captured_date: "2025-09-29"
|
| 296 |
+
note_en: "Day-0 V3.2 support announcement."
|
| 297 |
+
note_zh: "V3.2 的 Day-0 支持公告。"
|
| 298 |
+
caveats_en: []
|
| 299 |
+
caveats_zh: []
|
| 300 |
+
|
| 301 |
+
- engine: sglang
|
| 302 |
+
version_spec: ">=0.5.0"
|
| 303 |
+
matches_model_type: deepseek_v4
|
| 304 |
+
support: unverified
|
| 305 |
+
verification_level: unverified
|
| 306 |
+
required_flags: []
|
| 307 |
+
optional_flags: []
|
| 308 |
+
sources: []
|
| 309 |
+
caveats_en:
|
| 310 |
+
- "As of 2026-04-24, no Day-0 announcement for V4. DSA/NSA infrastructure exists (V3.2), expected to extend."
|
| 311 |
+
caveats_zh:
|
| 312 |
+
- "截至 2026-04-24,尚无 V4 的 Day-0 公告。已有 V3.2 的 DSA/NSA 基础设施,预期会扩展支持。"
|
| 313 |
+
|
| 314 |
+
- engine: sglang
|
| 315 |
+
version_spec: ">=0.4.0"
|
| 316 |
+
matches_model_type: deepseek_v3
|
| 317 |
+
support: full
|
| 318 |
+
verification_level: cited
|
| 319 |
+
required_flags: []
|
| 320 |
+
optional_flags: []
|
| 321 |
+
sources:
|
| 322 |
+
- type: release_notes
|
| 323 |
+
url: "https://github.com/sgl-project/sglang"
|
| 324 |
+
captured_date: "2026-04-24"
|
| 325 |
+
caveats_en: []
|
| 326 |
+
caveats_zh: []
|
| 327 |
+
|
| 328 |
+
- engine: sglang
|
| 329 |
+
version_spec: ">=0.4.0"
|
| 330 |
+
matches_model_type: llama
|
| 331 |
+
support: full
|
| 332 |
+
verification_level: cited
|
| 333 |
+
required_flags: []
|
| 334 |
+
optional_flags: []
|
| 335 |
+
sources:
|
| 336 |
+
- type: release_notes
|
| 337 |
+
url: "https://github.com/sgl-project/sglang"
|
| 338 |
+
captured_date: "2026-04-24"
|
| 339 |
+
caveats_en: []
|
| 340 |
+
caveats_zh: []
|
| 341 |
+
|
| 342 |
+
- engine: sglang
|
| 343 |
+
version_spec: ">=0.4.0"
|
| 344 |
+
matches_model_type: qwen3
|
| 345 |
+
support: full
|
| 346 |
+
verification_level: cited
|
| 347 |
+
required_flags: []
|
| 348 |
+
optional_flags: []
|
| 349 |
+
sources:
|
| 350 |
+
- type: release_notes
|
| 351 |
+
url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
|
| 352 |
+
captured_date: "2026-04-24"
|
| 353 |
+
caveats_en: []
|
| 354 |
+
caveats_zh: []
|
| 355 |
+
|
| 356 |
+
- engine: sglang
|
| 357 |
+
version_spec: ">=0.4.0"
|
| 358 |
+
matches_model_type: mixtral
|
| 359 |
+
support: full
|
| 360 |
+
verification_level: cited
|
| 361 |
+
required_flags: []
|
| 362 |
+
optional_flags: []
|
| 363 |
+
sources:
|
| 364 |
+
- type: release_notes
|
| 365 |
+
url: "https://github.com/sgl-project/sglang"
|
| 366 |
+
captured_date: "2026-04-24"
|
| 367 |
+
caveats_en: []
|
| 368 |
+
caveats_zh: []
|
| 369 |
+
|
| 370 |
+
- engine: sglang
|
| 371 |
+
version_spec: ">=0.4.0"
|
| 372 |
+
matches_model_type: qwen2
|
| 373 |
+
support: full
|
| 374 |
+
verification_level: cited
|
| 375 |
+
required_flags: []
|
| 376 |
+
optional_flags: []
|
| 377 |
+
sources:
|
| 378 |
+
- type: release_notes
|
| 379 |
+
url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
|
| 380 |
+
captured_date: "2026-04-24"
|
| 381 |
+
caveats_en: []
|
| 382 |
+
caveats_zh: []
|
| 383 |
+
|
| 384 |
+
- engine: sglang
|
| 385 |
+
version_spec: ">=0.4.0"
|
| 386 |
+
matches_model_type: qwen2_moe
|
| 387 |
+
support: full
|
| 388 |
+
verification_level: cited
|
| 389 |
+
required_flags: []
|
| 390 |
+
optional_flags: []
|
| 391 |
+
sources:
|
| 392 |
+
- type: release_notes
|
| 393 |
+
url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
|
| 394 |
+
captured_date: "2026-04-24"
|
| 395 |
+
caveats_en: []
|
| 396 |
+
caveats_zh: []
|
| 397 |
+
|
| 398 |
+
- engine: sglang
|
| 399 |
+
version_spec: ">=0.4.0"
|
| 400 |
+
matches_model_type: qwen3_moe
|
| 401 |
+
support: full
|
| 402 |
+
verification_level: cited
|
| 403 |
+
required_flags: []
|
| 404 |
+
optional_flags: []
|
| 405 |
+
sources:
|
| 406 |
+
- type: release_notes
|
| 407 |
+
url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
|
| 408 |
+
captured_date: "2026-04-24"
|
| 409 |
+
caveats_en: []
|
| 410 |
+
caveats_zh: []
|
| 411 |
+
|
| 412 |
+
- engine: sglang
|
| 413 |
+
version_spec: ">=0.4.0"
|
| 414 |
+
matches_model_type: mistral
|
| 415 |
+
support: full
|
| 416 |
+
verification_level: cited
|
| 417 |
+
required_flags: []
|
| 418 |
+
optional_flags: []
|
| 419 |
+
sources:
|
| 420 |
+
- type: release_notes
|
| 421 |
+
url: "https://github.com/sgl-project/sglang"
|
| 422 |
+
captured_date: "2026-04-24"
|
| 423 |
+
caveats_en: []
|
| 424 |
+
caveats_zh: []
|
| 425 |
+
|
| 426 |
+
- engine: sglang
|
| 427 |
+
version_spec: ">=0.4.0"
|
| 428 |
+
matches_model_type: gemma
|
| 429 |
+
support: full
|
| 430 |
+
verification_level: cited
|
| 431 |
+
required_flags: []
|
| 432 |
+
optional_flags: []
|
| 433 |
+
sources:
|
| 434 |
+
- type: release_notes
|
| 435 |
+
url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
|
| 436 |
+
captured_date: "2026-04-24"
|
| 437 |
+
caveats_en: []
|
| 438 |
+
caveats_zh: []
|
| 439 |
+
|
| 440 |
+
- engine: sglang
|
| 441 |
+
version_spec: ">=0.4.0"
|
| 442 |
+
matches_model_type: gemma2
|
| 443 |
+
support: full
|
| 444 |
+
verification_level: cited
|
| 445 |
+
required_flags: []
|
| 446 |
+
optional_flags: []
|
| 447 |
+
sources:
|
| 448 |
+
- type: release_notes
|
| 449 |
+
url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
|
| 450 |
+
captured_date: "2026-04-24"
|
| 451 |
+
caveats_en: []
|
| 452 |
+
caveats_zh: []
|
| 453 |
+
|
| 454 |
+
- engine: sglang
|
| 455 |
+
version_spec: ">=0.5.0"
|
| 456 |
+
matches_model_type: gemma3
|
| 457 |
+
support: full
|
| 458 |
+
verification_level: cited
|
| 459 |
+
required_flags: []
|
| 460 |
+
optional_flags: []
|
| 461 |
+
sources:
|
| 462 |
+
- type: release_notes
|
| 463 |
+
url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
|
| 464 |
+
captured_date: "2026-04-24"
|
| 465 |
+
caveats_en: []
|
| 466 |
+
caveats_zh: []
|
| 467 |
+
|
| 468 |
+
- engine: sglang
|
| 469 |
+
version_spec: ">=0.4.0"
|
| 470 |
+
matches_model_type: phi
|
| 471 |
+
support: full
|
| 472 |
+
verification_level: cited
|
| 473 |
+
required_flags: []
|
| 474 |
+
optional_flags: []
|
| 475 |
+
sources:
|
| 476 |
+
- type: release_notes
|
| 477 |
+
url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
|
| 478 |
+
captured_date: "2026-04-24"
|
| 479 |
+
caveats_en: []
|
| 480 |
+
caveats_zh: []
|
| 481 |
+
|
| 482 |
+
- engine: sglang
|
| 483 |
+
version_spec: ">=0.4.0"
|
| 484 |
+
matches_model_type: phi3
|
| 485 |
+
support: full
|
| 486 |
+
verification_level: cited
|
| 487 |
+
required_flags: []
|
| 488 |
+
optional_flags: []
|
| 489 |
+
sources:
|
| 490 |
+
- type: release_notes
|
| 491 |
+
url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
|
| 492 |
+
captured_date: "2026-04-24"
|
| 493 |
+
caveats_en: []
|
| 494 |
+
caveats_zh: []
|
| 495 |
+
|
| 496 |
+
- engine: sglang
|
| 497 |
+
version_spec: ">=0.4.0"
|
| 498 |
+
matches_model_type: deepseek_v2
|
| 499 |
+
support: full
|
| 500 |
+
verification_level: cited
|
| 501 |
+
required_flags: []
|
| 502 |
+
optional_flags:
|
| 503 |
+
- flag: "--trust-remote-code"
|
| 504 |
+
value: null
|
| 505 |
+
note_en: "Required for DeepSeek V2 custom modeling code."
|
| 506 |
+
note_zh: "加载 DeepSeek V2 的自定义建模代码。"
|
| 507 |
+
sources:
|
| 508 |
+
- type: release_notes
|
| 509 |
+
url: "https://github.com/sgl-project/sglang"
|
| 510 |
+
captured_date: "2026-04-24"
|
| 511 |
+
caveats_en: []
|
| 512 |
+
caveats_zh: []
|
src/llm_cal/fleet/__init__.py
ADDED
|
File without changes
|
src/llm_cal/fleet/planner.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Fleet planner — reverse-inference of "how many GPUs do I need".
|
| 2 |
+
|
| 3 |
+
Three tiers:
|
| 4 |
+
* min — just enough to hold weights + KV for one request at the
|
| 5 |
+
128K reference context (single-request serving)
|
| 6 |
+
* dev — room for ~8 concurrent at 128K context
|
| 7 |
+
* prod — room for ~16 concurrent at 128K context
|
| 8 |
+
|
| 9 |
+
TP-divisibility constraint (CRITICAL regression test): the number of attention
|
| 10 |
+
heads must be divisible by the number of GPUs. vLLM/SGLang with TP=3 on a
|
| 11 |
+
64-head model would fail to start; we only recommend counts in the valid set.
|
| 12 |
+
|
| 13 |
+
Reserved overhead per GPU = 10% of HBM (CUDA context + activations + framework),
|
| 14 |
+
which matches `--gpu-memory-utilization 0.9` in vLLM.
|
| 15 |
+
|
| 16 |
+
Per-GPU KV modeling is TP-aware:
|
| 17 |
+
|
| 18 |
+
per_gpu_KV = total_KV / min(tp_size, max(1, num_kv_heads))
|
| 19 |
+
|
| 20 |
+
* MQA (kv_heads=1): KV replicates fully across ranks → divisor is 1,
|
| 21 |
+
per-GPU KV = total (accurate for DeepSeek V4-Flash, Qwen MQA variants).
|
| 22 |
+
* GQA (kv_heads=8): KV splits across ranks up to num_kv_heads → at TP=8,
|
| 23 |
+
per-GPU KV = total/8 (accurate for Llama 3 70B, Qwen 72B).
|
| 24 |
+
* MHA: splits fully up to num_heads.
|
| 25 |
+
|
| 26 |
+
This matches vLLM/SGLang's actual sharding behavior. MLA-latent KV is
|
| 27 |
+
technically replicated in most frameworks, but since num_kv_heads is
|
| 28 |
+
typically 1 in MLA (DeepSeek V2/V3/V4), the formula degenerates to
|
| 29 |
+
replication anyway.
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
from __future__ import annotations
|
| 33 |
+
|
| 34 |
+
import math
|
| 35 |
+
from dataclasses import dataclass
|
| 36 |
+
from typing import Literal
|
| 37 |
+
|
| 38 |
+
from llm_cal.architecture.profile import ArchitectureProfile
|
| 39 |
+
from llm_cal.hardware.loader import GPUSpec
|
| 40 |
+
|
| 41 |
+
Tier = Literal["min", "dev", "prod"]
|
| 42 |
+
|
| 43 |
+
_OVERHEAD_FRACTION = 0.10
|
| 44 |
+
_KV_HEAD_ROOM_CONCURRENT: dict[Tier, int] = {
|
| 45 |
+
"min": 1, # one request worth of KV at 128K
|
| 46 |
+
"dev": 8,
|
| 47 |
+
"prod": 16,
|
| 48 |
+
}
|
| 49 |
+
# For recommendation logic, compute per-GPU fit at this reference context length.
|
| 50 |
+
_REFERENCE_CTX_TOKENS = 131_072
|
| 51 |
+
# Max recommended TP within a single 8-GPU node. Beyond this we'd want PP/EP,
|
| 52 |
+
# which is out of v0.1 scope.
|
| 53 |
+
_MAX_TP_SINGLE_NODE = 8
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
@dataclass(frozen=True)
class FleetOption:
    """One evaluated GPU-count candidate for a deployment tier (immutable)."""

    # Tier label this option was evaluated for.
    tier: Tier
    # Number of GPUs the figures below assume.
    gpu_count: int
    # Weight bytes each GPU holds when the weights are divided across GPUs.
    weight_bytes_per_gpu: int
    # KV-cache bytes of a single request at the reference context (128K).
    kv_bytes_per_request: int
    # How many reference-context requests fit in the per-GPU headroom.
    max_concurrent_at_reference_ctx: int
    # Concurrency ceiling for every context length the user asked about, as
    # (context token count, max concurrent requests that fit) pairs.
    max_concurrent_by_context: tuple[tuple[int, int], ...]
    # Per-GPU memory budget left after the reserved overhead.
    usable_bytes_per_gpu: int
    # False => even this option overflows the headroom at the reference ctx.
    fits: bool
    # Human-readable explanation of the verdict, in English and in Chinese.
    reason_en: str
    reason_zh: str
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
@dataclass(frozen=True)
class FleetRecommendation:
    """Planner result: the evaluated options plus the TP constraint info."""

    # Evaluated options: one per tier, or a single entry when the GPU count
    # was forced by the caller.
    options: tuple[FleetOption, ...]
    # Tier the planner suggests by default.
    best_tier: Tier
    # Tensor-parallel sizes considered valid for the model.
    valid_tp_sizes: tuple[int, ...]
    # Explanation of the TP constraint, in English and in Chinese.
    constraint_note_en: str
    constraint_note_zh: str
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def plan(
    profile: ArchitectureProfile,
    weight_bytes: int,
    kv_bytes_per_request_at_ref: int,
    gpu: GPUSpec,
    forced_gpu_count: int | None = None,
    kv_bytes_by_context: dict[int, int] | None = None,
) -> FleetRecommendation:
    """Recommend GPU counts for the three tiers, or a single option when forced.

    `kv_bytes_by_context` is optional metadata used only for the per-option
    concurrency breakdown (e.g. "~23 concurrent @ 128K, ~2 @ 1M"). Tier-fit
    decisions still use `kv_bytes_per_request_at_ref` (the reference context).

    When `forced_gpu_count` is given, exactly one option is returned; it is
    labelled "dev" and judged against the "dev" concurrency target.

    Raises ValueError if `forced_gpu_count` is given but is smaller than 1.
    """
    # Validate before any arithmetic: every per-GPU figure divides by the GPU
    # count, so 0 would surface as a bare ZeroDivisionError and a negative
    # count would produce meaningless negative sizes.
    if forced_gpu_count is not None and forced_gpu_count < 1:
        raise ValueError(f"forced_gpu_count must be >= 1, got {forced_gpu_count}")

    kv_by_ctx = kv_bytes_by_context or {}
    # memory_gb is scaled by 10^9 to bytes, then the reserved overhead share
    # is held back.
    bytes_per_gpu_total = gpu.memory_gb * 1_000_000_000
    usable_per_gpu = int(bytes_per_gpu_total * (1 - _OVERHEAD_FRACTION))
    valid_tp = _valid_tp_sizes(profile)

    constraint_en = _constraint_note_en(profile, valid_tp)
    constraint_zh = _constraint_note_zh(profile, valid_tp)

    if forced_gpu_count is not None:
        option = _evaluate_count(
            forced_gpu_count,
            profile=profile,
            weight_bytes=weight_bytes,
            kv_bytes=kv_bytes_per_request_at_ref,
            usable_per_gpu=usable_per_gpu,
            valid_tp=valid_tp,
            tier="dev",  # generic label when user forced
            kv_by_context=kv_by_ctx,
        )
        return FleetRecommendation(
            options=(option,),
            best_tier="dev",
            valid_tp_sizes=tuple(valid_tp),
            constraint_note_en=constraint_en,
            constraint_note_zh=constraint_zh,
        )

    options: list[FleetOption] = []
    for tier in ("min", "dev", "prod"):
        gpu_count = _smallest_fitting_count(
            valid_tp,
            profile=profile,
            weight_bytes=weight_bytes,
            kv_bytes=kv_bytes_per_request_at_ref,
            usable_per_gpu=usable_per_gpu,
            concurrent=_KV_HEAD_ROOM_CONCURRENT[tier],
        )
        # Fall back to the largest TP if nothing fits — flagged as `fits=False`.
        chosen = gpu_count if gpu_count is not None else max(valid_tp)
        option = _evaluate_count(
            chosen,
            profile=profile,
            weight_bytes=weight_bytes,
            kv_bytes=kv_bytes_per_request_at_ref,
            usable_per_gpu=usable_per_gpu,
            valid_tp=valid_tp,
            tier=tier,
            kv_by_context=kv_by_ctx,
        )
        options.append(option)

    # Best tier: dev if it fits, otherwise min, otherwise whatever exists.
    # options is ordered (min, dev, prod), so index 1 is dev and index 0 is min.
    best = "dev" if options[1].fits else ("min" if options[0].fits else "prod")
    return FleetRecommendation(
        options=tuple(options),
        best_tier=best,  # type: ignore[arg-type]
        valid_tp_sizes=tuple(valid_tp),
        constraint_note_en=constraint_en,
        constraint_note_zh=constraint_zh,
    )
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def _valid_tp_sizes(profile: ArchitectureProfile) -> list[int]:
    """Return the TP sizes that divide num_heads, up to the single-node cap.

    Falls back to `[1]` when the profile has no attention info or reports a
    non-positive head count.
    """
    attention = profile.attention
    if attention is None:
        return [1]
    head_count = attention.num_heads
    if head_count <= 0:
        return [1]

    upper = min(head_count, _MAX_TP_SINGLE_NODE)
    candidates = []
    for size in range(1, upper + 1):
        if head_count % size == 0:
            candidates.append(size)
    return candidates if candidates else [1]
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def _kv_shards(profile: ArchitectureProfile, tp_size: int) -> int:
|
| 167 |
+
"""How many ways KV cache can be split across TP ranks.
|
| 168 |
+
|
| 169 |
+
Saturates at num_kv_heads: once tp_size > num_kv_heads, extra ranks
|
| 170 |
+
just replicate, so the divisor stops growing.
|
| 171 |
+
"""
|
| 172 |
+
if profile.attention is None:
|
| 173 |
+
return 1
|
| 174 |
+
kv_heads = max(1, profile.attention.num_kv_heads)
|
| 175 |
+
return min(tp_size, kv_heads)
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def _smallest_fitting_count(
    valid_tp: list[int],
    *,
    profile: ArchitectureProfile,
    weight_bytes: int,
    kv_bytes: int,
    usable_per_gpu: int,
    concurrent: int,
) -> int | None:
    """Return the first entry of `valid_tp` that passes `_fits`, else None.

    Candidates are tried in the order given, so the result is the smallest
    fitting count only when `valid_tp` is sorted ascending.
    """
    fitting = (
        size
        for size in valid_tp
        if _fits(size, profile, weight_bytes, kv_bytes, usable_per_gpu, concurrent)
    )
    # The generator is lazy: evaluation stops at the first fitting size.
    return next(fitting, None)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def _fits(
    gpu_count: int,
    profile: ArchitectureProfile,
    weight_bytes: int,
    kv_bytes: int,
    usable_per_gpu: int,
    concurrent: int,
) -> bool:
    """Tell whether `gpu_count` GPUs can hold the weights plus the KV budget.

    The per-GPU demand is the GPU's share of the weights plus `concurrent`
    times its share of one request's KV cache; both shares are rounded up.
    Returns True when that demand does not exceed `usable_per_gpu`.
    """
    weight_share = math.ceil(weight_bytes / gpu_count)
    # KV is divided by the shard count from _kv_shards, not by gpu_count.
    kv_share = math.ceil(kv_bytes / _kv_shards(profile, gpu_count))
    demand = weight_share + concurrent * kv_share
    return not demand > usable_per_gpu
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def _evaluate_count(
    gpu_count: int,
    *,
    profile: ArchitectureProfile,
    weight_bytes: int,
    kv_bytes: int,
    usable_per_gpu: int,
    valid_tp: list[int],
    tier: Tier,
    kv_by_context: dict[int, int],
) -> FleetOption:
    """Build the `FleetOption` describing `gpu_count` GPUs for `tier`.

    Computes the per-GPU weight share, the concurrency ceiling at the
    reference context and at every context in `kv_by_context`, whether the
    tier's concurrency target fits, and a bilingual reason string.

    The reason distinguishes three cases: the count is not an allowed TP
    size (either it does not divide num_heads, or it divides num_heads but
    exceeds the single-node cap), the memory budget is exceeded, or it fits.
    """
    weight_per_gpu = math.ceil(weight_bytes / gpu_count)
    shards = _kv_shards(profile, gpu_count)
    kv_per_gpu = math.ceil(kv_bytes / shards)
    # Memory left on each GPU for KV once its weight share is resident.
    headroom = usable_per_gpu - weight_per_gpu
    max_concurrent = max(0, headroom // kv_per_gpu) if kv_per_gpu > 0 else 0
    # Per-context concurrency, sorted by context length ascending, each using
    # the TP-sharded per-GPU KV.
    max_concurrent_by_ctx = tuple(
        (
            ctx,
            (max(0, headroom // math.ceil(kv / shards)) if kv > 0 else 0),
        )
        for ctx, kv in sorted(kv_by_context.items())
    )
    target = _KV_HEAD_ROOM_CONCURRENT[tier]
    fits = _fits(
        gpu_count,
        profile,
        weight_bytes,
        kv_bytes,
        usable_per_gpu,
        target,
    )

    # Reason strings
    if gpu_count not in valid_tp:
        heads = profile.attention.num_heads if profile.attention else 0
        if gpu_count > _MAX_TP_SINGLE_NODE and heads > 0 and heads % gpu_count == 0:
            # The count does divide num_heads; it is missing from valid_tp only
            # because valid_tp is capped at the single-node maximum. Saying
            # "does not divide" here would be false.
            reason_en = (
                f"GPU count {gpu_count} exceeds the single-node TP limit of "
                f"{_MAX_TP_SINGLE_NODE} — valid TP sizes: {sorted(valid_tp)}"
            )
            reason_zh = (
                f"GPU 张数 {gpu_count} 超过单节点 TP 上限({_MAX_TP_SINGLE_NODE} 卡)"
                f"——有效 TP 张数:{sorted(valid_tp)}"
            )
        else:
            reason_en = (
                f"GPU count {gpu_count} does not divide num_heads — valid TP sizes: {sorted(valid_tp)}"
            )
            reason_zh = f"GPU 张数 {gpu_count} 无法整除注意力头数——有效 TP 张数:{sorted(valid_tp)}"
    elif not fits:
        reason_en = (
            f"Weights + {target}x KV would exceed "
            f"{usable_per_gpu / 1e9:.1f} GB usable per GPU"
        )
        reason_zh = (
            f"权重 + {target} 份 KV 超过单卡可用的 "
            f"{usable_per_gpu / 1e9:.1f} GB"
        )
    else:
        reason_en = f"fits ~{max_concurrent} concurrent @ {_REFERENCE_CTX_TOKENS // 1024}K ctx"
        reason_zh = f"可容纳约 {max_concurrent} 并发请求 @ {_REFERENCE_CTX_TOKENS // 1024}K 上下文"

    return FleetOption(
        tier=tier,
        gpu_count=gpu_count,
        weight_bytes_per_gpu=weight_per_gpu,
        kv_bytes_per_request=kv_bytes,
        max_concurrent_at_reference_ctx=max_concurrent,
        max_concurrent_by_context=max_concurrent_by_ctx,
        usable_bytes_per_gpu=usable_per_gpu,
        fits=fits,
        reason_en=reason_en,
        reason_zh=reason_zh,
    )
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
def _constraint_note_en(profile: ArchitectureProfile, valid_tp: list[int]) -> str:
|
| 276 |
+
heads = profile.attention.num_heads if profile.attention else 0
|
| 277 |
+
return f"TP must divide num_heads={heads}. Candidates within one node (<=8 GPUs): {valid_tp}."
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
def _constraint_note_zh(profile: ArchitectureProfile, valid_tp: list[int]) -> str:
|
| 281 |
+
heads = profile.attention.num_heads if profile.attention else 0
|
| 282 |
+
return f"TP 张数必须整除 num_heads={heads}。单节点(≤8 卡)候选:{valid_tp}。"
|
src/llm_cal/hardware/__init__.py
ADDED
|
File without changes
|
src/llm_cal/hardware/gpu_database.yaml
ADDED
|
@@ -0,0 +1,613 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GPU database — v0.1.
|
| 2 |
+
#
|
| 3 |
+
# DATA PROVENANCE:
|
| 4 |
+
# Numeric specs (memory_gb, nvlink_bandwidth_gbps, fp16_tflops, fp8/fp4_support)
|
| 5 |
+
# come from public vendor datasheets and commonly-cited benchmarks. Each entry
|
| 6 |
+
# records its source in `spec_source` so users can audit.
|
| 7 |
+
#
|
| 8 |
+
# Conventions:
|
| 9 |
+
# - memory_gb: per-card HBM / GDDR in GB (vendor nominal)
|
| 10 |
+
# - nvlink_bandwidth_gbps: aggregate NVLink (or equivalent like xGMI/HCCS)
|
| 11 |
+
# bandwidth in GB/s (gigabytes per second, despite the "gbps" key suffix). 0 if the GPU has no high-bandwidth interconnect (e.g. consumer
|
| 12 |
+
# Ada removed NVLink).
|
| 13 |
+
# - fp16_tflops: peak dense FP16/BF16 with Tensor Cores; vendor cited figure.
|
| 14 |
+
# - fp8_support / fp4_support: whether the GPU has NATIVE Tensor Core
|
| 15 |
+
# acceleration for that precision. Software emulation does NOT count.
|
| 16 |
+
#
|
| 17 |
+
# To add a new GPU: append an entry with all required fields + spec_source.
|
| 18 |
+
# See docs/architecture-guide.md "How to add a new GPU".
|
| 19 |
+
schema_version: 1
|
| 20 |
+
gpus:
|
| 21 |
+
# ========================================================================
|
| 22 |
+
# NVIDIA Blackwell (2024+) — native FP4
|
| 23 |
+
# ========================================================================
|
| 24 |
+
- id: B200
|
| 25 |
+
aliases: [B200-SXM, B200-192G]
|
| 26 |
+
memory_gb: 192
|
| 27 |
+
nvlink_bandwidth_gbps: 1800
|
| 28 |
+
memory_bandwidth_gbps: 8000
|
| 29 |
+
fp16_tflops: 2250
|
| 30 |
+
fp8_support: true
|
| 31 |
+
fp4_support: true
|
| 32 |
+
spec_source: "NVIDIA Blackwell architecture overview (nvidia.com/blackwell)"
|
| 33 |
+
notes_en: "Blackwell flagship. Native FP4 Tensor Cores. First GPU that accelerates DeepSeek-V4-Flash-style FP4 at hardware level."
|
| 34 |
+
notes_zh: "Blackwell 旗舰。原生 FP4 Tensor Core,首款在硬件层加速 DeepSeek-V4-Flash 类 FP4 模型的 GPU。"
|
| 35 |
+
|
| 36 |
+
# ========================================================================
|
| 37 |
+
# NVIDIA Hopper (2022+)
|
| 38 |
+
# ========================================================================
|
| 39 |
+
- id: H100
|
| 40 |
+
aliases: [H100-SXM5, H100-80G, H100-SXM]
|
| 41 |
+
memory_gb: 80
|
| 42 |
+
nvlink_bandwidth_gbps: 900
|
| 43 |
+
memory_bandwidth_gbps: 3350
|
| 44 |
+
fp16_tflops: 989
|
| 45 |
+
fp8_support: true
|
| 46 |
+
fp4_support: false
|
| 47 |
+
spec_source: "NVIDIA H100 datasheet (nvidia.com/h100)"
|
| 48 |
+
notes_en: "Hopper flagship. Full NVLink."
|
| 49 |
+
notes_zh: "Hopper 架构旗舰,完整 NVLink 带宽。"
|
| 50 |
+
|
| 51 |
+
- id: H800
|
| 52 |
+
aliases: [H800-SXM5, H800-80G]
|
| 53 |
+
memory_gb: 80
|
| 54 |
+
nvlink_bandwidth_gbps: 400
|
| 55 |
+
memory_bandwidth_gbps: 3350
|
| 56 |
+
fp16_tflops: 989
|
| 57 |
+
fp8_support: true
|
| 58 |
+
fp4_support: false
|
| 59 |
+
spec_source: "NVIDIA H800 compliance variant — NVLink halved from H100 per US export controls"
|
| 60 |
+
notes_en: "China-regulated H100 variant. NVLink bandwidth halved (400 vs 900). Same HBM and compute as H100."
|
| 61 |
+
notes_zh: "H100 的中国合规版本。NVLink 带宽减半(400 vs 900 GB/s),HBM 容量和算力与 H100 相同。"
|
| 62 |
+
|
| 63 |
+
- id: H200
|
| 64 |
+
aliases: [H200-SXM, H200-141G]
|
| 65 |
+
memory_gb: 141
|
| 66 |
+
nvlink_bandwidth_gbps: 900
|
| 67 |
+
memory_bandwidth_gbps: 4800
|
| 68 |
+
fp16_tflops: 989
|
| 69 |
+
fp8_support: true
|
| 70 |
+
fp4_support: false
|
| 71 |
+
spec_source: "NVIDIA H200 datasheet (nvidia.com/h200)"
|
| 72 |
+
notes_en: "Hopper with HBM3e. 141 GB per GPU."
|
| 73 |
+
notes_zh: "搭载 HBM3e 的 Hopper,单卡 141 GB。"
|
| 74 |
+
|
| 75 |
+
- id: GH200
|
| 76 |
+
aliases: [Grace-Hopper, GH200-144G, GH200-96G]
|
| 77 |
+
memory_gb: 144
|
| 78 |
+
nvlink_bandwidth_gbps: 900
|
| 79 |
+
memory_bandwidth_gbps: 4800
|
| 80 |
+
fp16_tflops: 989
|
| 81 |
+
fp8_support: true
|
| 82 |
+
fp4_support: false
|
| 83 |
+
spec_source: "NVIDIA GH200 Grace Hopper datasheet 2023 (144GB HBM3e variant, dense FP16=989 TFLOPS; sparsity doubles it)"
|
| 84 |
+
notes_en: "Grace Hopper superchip — Hopper GPU + Grace CPU on one module. 144 GB HBM3e (96 GB HBM3 variant also exists). NVLink-C2C 900 GB/s CPU<->GPU unified. TDP programmable 450-1000W. Ideal for models that spill beyond single GPU memory because GPU can access CPU LPDDR coherently."
|
| 85 |
+
notes_zh: "Grace Hopper 超级芯片 — Hopper GPU + Grace CPU 融合模组。144 GB HBM3e(另有 96 GB HBM3 版本)。NVLink-C2C 让 CPU/GPU 共享统一内存空间,900 GB/s 双向。TDP 可编程 450-1000W。模型单卡显存装不下时,可一致地访问 CPU 的 LPDDR。"
|
| 86 |
+
|
| 87 |
+
- id: GB200
|
| 88 |
+
aliases: [Grace-Blackwell, GB200-per-GPU]
|
| 89 |
+
memory_gb: 192
|
| 90 |
+
nvlink_bandwidth_gbps: 1800
|
| 91 |
+
memory_bandwidth_gbps: 8000
|
| 92 |
+
fp16_tflops: 2250
|
| 93 |
+
fp8_support: true
|
| 94 |
+
fp4_support: true
|
| 95 |
+
spec_source: "NVIDIA GB200 Superchip datasheet 2024 — per-GPU view. Each GB200 = 2 B200 + Grace CPU. Per B200: 192 GB HBM3e, 8 TB/s, 2250 TFLOPS dense FP16 (4500 sparsity). Grace CPU adds up to 480 GB LPDDR5x accessible via NVLink-C2C."
|
| 96 |
+
notes_en: "Grace Blackwell superchip — 2 B200 GPUs + Grace CPU on one module. Per-GPU specs here match B200, but each GB200 module unlocks 384 GB HBM3e total (192+192) plus coherent access to 480 GB Grace CPU LPDDR5x. FP4 native. Only deployable in NVL4/NVL72 rack-scale systems with liquid cooling. Per-GPU TDP 1200W."
|
| 97 |
+
notes_zh: "Grace Blackwell 超级芯片 — 双 B200 GPU + Grace CPU 融合。此处展示单 GPU 视角规格,与 B200 基本一致。每块 GB200 模组合计 384 GB HBM3e(双卡),并通过 NVLink-C2C 一致访问 480 GB Grace CPU 的 LPDDR5x。原生 FP4。仅在 NVL4 / NVL72 液冷机架系统中部署。单 GPU TDP 1200W。"
|
| 98 |
+
|
| 99 |
+
- id: H20
|
| 100 |
+
aliases: [H20-96G, H20-SXM]
|
| 101 |
+
memory_gb: 96
|
| 102 |
+
nvlink_bandwidth_gbps: 900
|
| 103 |
+
memory_bandwidth_gbps: 4000
|
| 104 |
+
fp16_tflops: 148
|
| 105 |
+
fp8_support: true
|
| 106 |
+
fp4_support: false
|
| 107 |
+
spec_source: "NVIDIA H20 — released 2024 as China-compliant successor to H800. Compute heavily reduced (~15% of H100); memory bandwidth and HBM3e preserved."
|
| 108 |
+
notes_en: "China-compliance Hopper post-Oct-2023 export rules. Compute ~15% of H100 (148 vs 989 TFLOPS), but HBM3e memory bandwidth preserved. Good for memory-bound LLM inference, poor for training."
|
| 109 |
+
notes_zh: "2023 年 10 月出口管制后的中国合规 Hopper。算力仅为 H100 的约 15%(148 vs 989 TFLOPS),但 HBM3e 显存带宽保留。推理(显存带宽受限)尚可,训练基本不实用。"
|
| 110 |
+
|
| 111 |
+
# ========================================================================
|
| 112 |
+
# NVIDIA Ada Lovelace (datacenter) — FP8 yes, NVLink no
|
| 113 |
+
# ========================================================================
|
| 114 |
+
- id: L40S
|
| 115 |
+
aliases: [L40-S, L40S-48G]
|
| 116 |
+
memory_gb: 48
|
| 117 |
+
nvlink_bandwidth_gbps: 0
|
| 118 |
+
memory_bandwidth_gbps: 864
|
| 119 |
+
fp16_tflops: 362
|
| 120 |
+
fp8_support: true
|
| 121 |
+
fp4_support: false
|
| 122 |
+
spec_source: "NVIDIA L40S datasheet 2023"
|
| 123 |
+
notes_en: "Ada datacenter. 48 GB GDDR6. No NVLink — multi-GPU setups rely on PCIe. Cost-effective for small/medium model inference."
|
| 124 |
+
notes_zh: "Ada 架构数据中心卡,48 GB GDDR6。无 NVLink,多卡需走 PCIe。中小模型推理性价比高。"
|
| 125 |
+
|
| 126 |
+
- id: L40
|
| 127 |
+
aliases: [L40-48G]
|
| 128 |
+
memory_gb: 48
|
| 129 |
+
nvlink_bandwidth_gbps: 0
|
| 130 |
+
memory_bandwidth_gbps: 864
|
| 131 |
+
fp16_tflops: 181
|
| 132 |
+
fp8_support: true
|
| 133 |
+
fp4_support: false
|
| 134 |
+
spec_source: "NVIDIA L40 datasheet 2022"
|
| 135 |
+
notes_en: "Ada datacenter predecessor to L40S. Same 48 GB, half the compute. Widely deployed in enterprise clouds."
|
| 136 |
+
notes_zh: "L40S 的前代,Ada 架构数据中心卡。同为 48 GB,算力减半。企业私有云部署量较大。"
|
| 137 |
+
|
| 138 |
+
- id: L4
|
| 139 |
+
aliases: [L4-24G]
|
| 140 |
+
memory_gb: 24
|
| 141 |
+
nvlink_bandwidth_gbps: 0
|
| 142 |
+
memory_bandwidth_gbps: 300
|
| 143 |
+
fp16_tflops: 121
|
| 144 |
+
fp8_support: true
|
| 145 |
+
fp4_support: false
|
| 146 |
+
spec_source: "NVIDIA L4 datasheet 2023"
|
| 147 |
+
notes_en: "Low-profile Ada, 24 GB GDDR6. Common in low-concurrency inference / transcoding. No NVLink."
|
| 148 |
+
notes_zh: "低功耗 Ada,24 GB GDDR6。常用于低并发推理和转码场景。无 NVLink。"
|
| 149 |
+
|
| 150 |
+
- id: RTX6000-Ada
|
| 151 |
+
aliases: [RTX-6000-Ada, RTX6000Ada, L6000]
|
| 152 |
+
memory_gb: 48
|
| 153 |
+
nvlink_bandwidth_gbps: 0
|
| 154 |
+
memory_bandwidth_gbps: 960
|
| 155 |
+
fp16_tflops: 365
|
| 156 |
+
fp8_support: true
|
| 157 |
+
fp4_support: false
|
| 158 |
+
spec_source: "NVIDIA RTX 6000 Ada datasheet 2022"
|
| 159 |
+
notes_en: "Ada Pro workstation. 48 GB, similar to L40S but for workstations. FP8 yes, no NVLink."
|
| 160 |
+
notes_zh: "Ada Pro 工作站卡。48 GB,规格接近 L40S 但面向工作站。支持 FP8,无 NVLink。"
|
| 161 |
+
|
| 162 |
+
- id: RTX4090
|
| 163 |
+
aliases: ["4090", RTX-4090]
|
| 164 |
+
memory_gb: 24
|
| 165 |
+
nvlink_bandwidth_gbps: 0
|
| 166 |
+
memory_bandwidth_gbps: 1008
|
| 167 |
+
fp16_tflops: 165
|
| 168 |
+
fp8_support: true
|
| 169 |
+
fp4_support: false
|
| 170 |
+
spec_source: "NVIDIA RTX 4090 datasheet 2022"
|
| 171 |
+
notes_en: "Consumer Ada. No NVLink. Large models need multi-GPU via PCIe (slower)."
|
| 172 |
+
notes_zh: "消费级 Ada 架构,无 NVLink。大模型多卡只能走 PCIe(明显更慢)。"
|
| 173 |
+
|
| 174 |
+
# ========================================================================
|
| 175 |
+
# NVIDIA Ampere (2020+)
|
| 176 |
+
# ========================================================================
|
| 177 |
+
- id: A100-80G
|
| 178 |
+
aliases: [A100-80, A100-SXM-80G]
|
| 179 |
+
memory_gb: 80
|
| 180 |
+
nvlink_bandwidth_gbps: 600
|
| 181 |
+
memory_bandwidth_gbps: 2039
|
| 182 |
+
fp16_tflops: 312
|
| 183 |
+
fp8_support: false
|
| 184 |
+
fp4_support: false
|
| 185 |
+
spec_source: "NVIDIA A100 datasheet 2020"
|
| 186 |
+
notes_en: "Ampere. No native FP8. Still widely deployed."
|
| 187 |
+
notes_zh: "Ampere 架构。不原生支持 FP8,但部署量仍然非常大。"
|
| 188 |
+
|
| 189 |
+
- id: A100-40G
|
| 190 |
+
aliases: [A100-40, A100-SXM-40G]
|
| 191 |
+
memory_gb: 40
|
| 192 |
+
nvlink_bandwidth_gbps: 600
|
| 193 |
+
memory_bandwidth_gbps: 1555
|
| 194 |
+
fp16_tflops: 312
|
| 195 |
+
fp8_support: false
|
| 196 |
+
fp4_support: false
|
| 197 |
+
spec_source: "NVIDIA A100 40GB datasheet 2020"
|
| 198 |
+
notes_en: "Ampere 40 GB variant. Smaller HBM limits large-model single-node deployments."
|
| 199 |
+
notes_zh: "Ampere 的 40 GB 版本,显存较小,大模型单机部署受限。"
|
| 200 |
+
|
| 201 |
+
- id: A40
|
| 202 |
+
aliases: [A40-48G]
|
| 203 |
+
memory_gb: 48
|
| 204 |
+
nvlink_bandwidth_gbps: 112
|
| 205 |
+
memory_bandwidth_gbps: 696
|
| 206 |
+
fp16_tflops: 150
|
| 207 |
+
fp8_support: false
|
| 208 |
+
fp4_support: false
|
| 209 |
+
spec_source: "NVIDIA A40 datasheet 2020"
|
| 210 |
+
notes_en: "Ampere workstation. 48 GB with NVLink bridge (limited bandwidth). No FP8."
|
| 211 |
+
notes_zh: "Ampere 工作站卡,48 GB + NVLink 桥接(带宽较低)。不支持 FP8。"
|
| 212 |
+
|
| 213 |
+
- id: A10
|
| 214 |
+
aliases: [A10-24G]
|
| 215 |
+
memory_gb: 24
|
| 216 |
+
nvlink_bandwidth_gbps: 0
|
| 217 |
+
memory_bandwidth_gbps: 600
|
| 218 |
+
fp16_tflops: 125
|
| 219 |
+
fp8_support: false
|
| 220 |
+
fp4_support: false
|
| 221 |
+
spec_source: "NVIDIA A10 datasheet 2021"
|
| 222 |
+
notes_en: "Ampere inference card. 24 GB GDDR6. Widely used for low-cost inference in enterprise clouds."
|
| 223 |
+
notes_zh: "Ampere 推理卡,24 GB GDDR6。企业云低成本推理常用配置。"
|
| 224 |
+
|
| 225 |
+
- id: A10G
|
| 226 |
+
aliases: [A10G-24G]
|
| 227 |
+
memory_gb: 24
|
| 228 |
+
nvlink_bandwidth_gbps: 0
|
| 229 |
+
memory_bandwidth_gbps: 600
|
| 230 |
+
fp16_tflops: 125
|
| 231 |
+
fp8_support: false
|
| 232 |
+
fp4_support: false
|
| 233 |
+
spec_source: "NVIDIA A10G — AWS-specific variant of A10, g5 instances"
|
| 234 |
+
notes_en: "AWS-specific A10 variant. Same silicon as A10, deployed in g5 EC2 instances. No NVLink."
|
| 235 |
+
notes_zh: "AWS 定制版 A10,用于 g5 EC2 实例。核心规格与 A10 相同,无 NVLink。"
|
| 236 |
+
|
| 237 |
+
# ========================================================================
|
| 238 |
+
# NVIDIA Volta / Turing (older, still deployed)
|
| 239 |
+
# ========================================================================
|
| 240 |
+
- id: V100-SXM2-32G
|
| 241 |
+
aliases: [V100, V100-32G, V100-SXM2]
|
| 242 |
+
memory_gb: 32
|
| 243 |
+
nvlink_bandwidth_gbps: 300
|
| 244 |
+
memory_bandwidth_gbps: 900
|
| 245 |
+
fp16_tflops: 125
|
| 246 |
+
fp8_support: false
|
| 247 |
+
fp4_support: false
|
| 248 |
+
spec_source: "NVIDIA V100 SXM2 datasheet 2017"
|
| 249 |
+
notes_en: "Volta. No FP8. Still deployed in many existing clusters — works for smaller models, tight for 70B+."
|
| 250 |
+
notes_zh: "Volta 架构。不支持 FP8,但仍在大量老集群中服役。小模型够用,70B+ 紧张。"
|
| 251 |
+
|
| 252 |
+
- id: V100-PCIe-32G
|
| 253 |
+
aliases: [V100-PCIe, V100-PCI]
|
| 254 |
+
memory_gb: 32
|
| 255 |
+
nvlink_bandwidth_gbps: 0
|
| 256 |
+
memory_bandwidth_gbps: 900
|
| 257 |
+
fp16_tflops: 112
|
| 258 |
+
fp8_support: false
|
| 259 |
+
fp4_support: false
|
| 260 |
+
spec_source: "NVIDIA V100 PCIe datasheet 2017 — PCIe variant of V100, no NVLink."
|
| 261 |
+
notes_en: "PCIe version of V100. No NVLink, lower clocks than SXM2. Common in older servers."
|
| 262 |
+
notes_zh: "V100 的 PCIe 版本,无 NVLink,主频稍低。老服务器常见配置。"
|
| 263 |
+
|
| 264 |
+
- id: T4
|
| 265 |
+
aliases: [T4-16G]
|
| 266 |
+
memory_gb: 16
|
| 267 |
+
nvlink_bandwidth_gbps: 0
|
| 268 |
+
memory_bandwidth_gbps: 320
|
| 269 |
+
fp16_tflops: 65
|
| 270 |
+
fp8_support: false
|
| 271 |
+
fp4_support: false
|
| 272 |
+
spec_source: "NVIDIA T4 datasheet 2018"
|
| 273 |
+
notes_en: "Turing inference card. 16 GB, no NVLink, no FP8. Common as the cheapest cloud GPU option."
|
| 274 |
+
notes_zh: "Turing 推理卡。16 GB,无 NVLink,无 FP8。各云厂商最便宜的 GPU 选项之一。"
|
| 275 |
+
|
| 276 |
+
# ========================================================================
|
| 277 |
+
# AMD (ROCm, xGMI instead of NVLink)
|
| 278 |
+
# ========================================================================
|
| 279 |
+
- id: MI325X
|
| 280 |
+
aliases: [MI325X-256G, AMD-MI325X]
|
| 281 |
+
memory_gb: 256
|
| 282 |
+
nvlink_bandwidth_gbps: 896
|
| 283 |
+
memory_bandwidth_gbps: 6000
|
| 284 |
+
fp16_tflops: 1307
|
| 285 |
+
fp8_support: true
|
| 286 |
+
fp4_support: false
|
| 287 |
+
spec_source: "AMD Instinct MI325X datasheet 2024 — 256 GB HBM3E, 6 TB/s bandwidth, 1000W TDP, CDNA 3."
|
| 288 |
+
notes_en: "AMD flagship 2024. 256 GB HBM3E (largest single-card memory in v0.1 database). Upgraded MI300X with faster HBM3E and more capacity. Dense FP16 1307 TFLOPS, FP8 2615 TFLOPS. 1000W TDP, OAM format. ROCm software stack."
|
| 289 |
+
notes_zh: "AMD 2024 年旗舰。256 GB HBM3E(v0.1 数据库中单卡最大)。MI300X 升级版,HBM3E 更快、容量更大。Dense FP16 1307 TFLOPS,FP8 2615 TFLOPS。1000W TDP,OAM 形态。需要 ROCm 软件栈。"
|
| 290 |
+
|
| 291 |
+
- id: MI300X
|
| 292 |
+
aliases: [MI300X-192G, AMD-MI300X]
|
| 293 |
+
memory_gb: 192
|
| 294 |
+
nvlink_bandwidth_gbps: 896
|
| 295 |
+
memory_bandwidth_gbps: 5300
|
| 296 |
+
fp16_tflops: 1307
|
| 297 |
+
fp8_support: true
|
| 298 |
+
fp4_support: false
|
| 299 |
+
spec_source: "AMD Instinct MI300X datasheet 2023-12"
|
| 300 |
+
notes_en: "AMD flagship 2023. 192 GB HBM3. xGMI 896 GB/s (like NVLink). Software stack: ROCm + vLLM. Support for new models (e.g. DeepSeek V4) typically lags NVIDIA by weeks."
|
| 301 |
+
notes_zh: "AMD 2023 年旗舰。192 GB HBM3。xGMI 互联 896 GB/s(类 NVLink)。需要 ROCm + vLLM 栈。新模型支持通常比 NVIDIA 晚几周。"
|
| 302 |
+
|
| 303 |
+
- id: MI250X
|
| 304 |
+
aliases: [MI250X-128G, AMD-MI250X]
|
| 305 |
+
memory_gb: 128
|
| 306 |
+
nvlink_bandwidth_gbps: 800
|
| 307 |
+
memory_bandwidth_gbps: 3280
|
| 308 |
+
fp16_tflops: 383
|
| 309 |
+
fp8_support: false
|
| 310 |
+
fp4_support: false
|
| 311 |
+
spec_source: "AMD Instinct MI250X datasheet 2022"
|
| 312 |
+
notes_en: "AMD previous-gen. 128 GB HBM2e. No FP8. Deployed in some HPC clusters (Frontier)."
|
| 313 |
+
notes_zh: "AMD 上代数据中心卡。128 GB HBM2e,不支持 FP8。少数 HPC 集群(如 Frontier 超算)有部署。"
|
| 314 |
+
|
| 315 |
+
- id: MI210
|
| 316 |
+
aliases: [MI210-64G, AMD-MI210]
|
| 317 |
+
memory_gb: 64
|
| 318 |
+
nvlink_bandwidth_gbps: 300
|
| 319 |
+
memory_bandwidth_gbps: 1600
|
| 320 |
+
fp16_tflops: 181
|
| 321 |
+
fp8_support: false
|
| 322 |
+
fp4_support: false
|
| 323 |
+
spec_source: "AMD Instinct MI210 datasheet 2022 — CDNA 2, single-die version of MI250. 64 GB HBM2e."
|
| 324 |
+
notes_en: "AMD CDNA 2 single-die. 64 GB HBM2e, 1.6 TB/s. No FP8 (CDNA 2 limitation). Common as entry-level AMD datacenter card."
|
| 325 |
+
notes_zh: "AMD CDNA 2 单 die 版本,64 GB HBM2e,1.6 TB/s 带宽。不支持 FP8(CDNA 2 限制)。AMD 入门数据中心卡常见配置。"
|
| 326 |
+
|
| 327 |
+
# ========================================================================
|
| 328 |
+
# Intel Habana Gaudi
|
| 329 |
+
# ========================================================================
|
| 330 |
+
- id: Gaudi3
|
| 331 |
+
aliases: [Gaudi-3, Habana-Gaudi3]
|
| 332 |
+
memory_gb: 128
|
| 333 |
+
nvlink_bandwidth_gbps: 1200
|
| 334 |
+
memory_bandwidth_gbps: 3700
|
| 335 |
+
fp16_tflops: 1835
|
| 336 |
+
fp8_support: true
|
| 337 |
+
fp4_support: false
|
| 338 |
+
spec_source: "Intel Gaudi 3 datasheet 2024"
|
| 339 |
+
notes_en: "Intel Habana Gaudi 3. 128 GB HBM2e. FP8 support. Software stack: SynapseAI (not CUDA). vLLM support via Intel fork."
|
| 340 |
+
notes_zh: "Intel Habana Gaudi 3。128 GB HBM2e,支持 FP8。软件栈为 SynapseAI(非 CUDA)。vLLM 需走 Intel 分支。"
|
| 341 |
+
|
| 342 |
+
- id: Gaudi2
|
| 343 |
+
aliases: [Gaudi-2, Habana-Gaudi2]
|
| 344 |
+
memory_gb: 96
|
| 345 |
+
nvlink_bandwidth_gbps: 2400
|
| 346 |
+
memory_bandwidth_gbps: 2450
|
| 347 |
+
fp16_tflops: 432
|
| 348 |
+
fp8_support: true
|
| 349 |
+
fp4_support: false
|
| 350 |
+
spec_source: "Intel Gaudi 2 datasheet 2022"
|
| 351 |
+
notes_en: "Intel Habana Gaudi 2. 96 GB HBM2e with 24x100GbE on-board (used for scale-out). FP8 support."
|
| 352 |
+
notes_zh: "Intel Habana Gaudi 2。96 GB HBM2e,板载 24 个 100GbE(用于横向扩展)。支持 FP8。"
|
| 353 |
+
|
| 354 |
+
# ========================================================================
|
| 355 |
+
# Huawei Ascend
|
| 356 |
+
# ========================================================================
|
| 357 |
+
# The 910B "series" is actually a set of sub-variants (B1/B2/B3/B4) with
|
| 358 |
+
# different compute tiers and memory sizes. `910B` as a plain id resolves
|
| 359 |
+
# to 910B3 (the most common training configuration).
|
| 360 |
+
- id: "910A"
|
| 361 |
+
aliases: [Ascend-910A]
|
| 362 |
+
memory_gb: 32
|
| 363 |
+
nvlink_bandwidth_gbps: 400
|
| 364 |
+
memory_bandwidth_gbps: 1200
|
| 365 |
+
fp16_tflops: 256
|
| 366 |
+
fp8_support: false
|
| 367 |
+
fp4_support: false
|
| 368 |
+
spec_source: "Ascend 910 (1st gen) — 7nm, 32 GB HBM. Community-compiled spec."
|
| 369 |
+
notes_en: "Huawei Ascend 910 (1st gen, 2019). Predecessor to 910B. Still deployed in many older clusters. HCCS interconnect."
|
| 370 |
+
notes_zh: "华为昇腾 910 第一代(2019 年),910B 的前身。很多老集群仍在使用。HCCS 互联。"
|
| 371 |
+
|
| 372 |
+
- id: "910B1"
|
| 373 |
+
aliases: [Ascend-910B1]
|
| 374 |
+
memory_gb: 64
|
| 375 |
+
nvlink_bandwidth_gbps: 400
|
| 376 |
+
memory_bandwidth_gbps: 1600
|
| 377 |
+
fp16_tflops: 414
|
| 378 |
+
fp8_support: false
|
| 379 |
+
fp4_support: false
|
| 380 |
+
spec_source: "Ascend 910B1 — training variant, Atlas 800T A2. Commonly cited as top-tier 910B sub-variant; TSMC 7nm process."
|
| 381 |
+
notes_en: "Top-tier 910B training variant. 64 GB HBM2, 414 TFLOPS FP16. Used in Atlas 800T A2 training servers. No native FP8."
|
| 382 |
+
notes_zh: "910B 系列顶配训练版本。64 GB HBM2,FP16 算力 414 TFLOPS。搭载于 Atlas 800T A2 训练服务器。不原生支持 FP8。"
|
| 383 |
+
|
| 384 |
+
- id: "910B2"
|
| 385 |
+
aliases: [Ascend-910B2]
|
| 386 |
+
memory_gb: 64
|
| 387 |
+
nvlink_bandwidth_gbps: 400
|
| 388 |
+
memory_bandwidth_gbps: 1600
|
| 389 |
+
fp16_tflops: 376
|
| 390 |
+
fp8_support: false
|
| 391 |
+
fp4_support: false
|
| 392 |
+
spec_source: "Ascend 910B2 — training variant, commonly cited as standard 910B training configuration."
|
| 393 |
+
notes_en: "Standard 910B training variant. 64 GB HBM2, 376 TFLOPS FP16. General-purpose training server baseline."
|
| 394 |
+
notes_zh: "910B 常规训练版本。64 GB HBM2,FP16 算力 376 TFLOPS。通用训练服务器标准配置。"
|
| 395 |
+
|
| 396 |
+
- id: "910B3"
|
| 397 |
+
aliases: [Ascend-910B3, "910B", Ascend-910B]
|
| 398 |
+
memory_gb: 64
|
| 399 |
+
nvlink_bandwidth_gbps: 400
|
| 400 |
+
memory_bandwidth_gbps: 1600
|
| 401 |
+
fp16_tflops: 313
|
| 402 |
+
fp8_support: false
|
| 403 |
+
fp4_support: false
|
| 404 |
+
spec_source: "Ascend 910B3 — training variant, SMIC-produced per industry reports. (aliased as bare `910B` for convenience)"
|
| 405 |
+
notes_en: "910B3 training variant, 313 TFLOPS FP16. Believed to be SMIC-produced (vs TSMC for B1/B2). The `910B` bare name resolves here since B3 is the most commonly referenced."
|
| 406 |
+
notes_zh: "910B3 训练版本,FP16 算力 313 TFLOPS。业界普遍认为由中芯国际生产(B1/B2 据传为台积电)。裸写 `910B` 时默认解析到此条目(最常被引用)。"
|
| 407 |
+
|
| 408 |
+
- id: "910B4"
|
| 409 |
+
aliases: [Ascend-910B4]
|
| 410 |
+
memory_gb: 32
|
| 411 |
+
nvlink_bandwidth_gbps: 400
|
| 412 |
+
memory_bandwidth_gbps: 1600
|
| 413 |
+
fp16_tflops: 280
|
| 414 |
+
fp8_support: false
|
| 415 |
+
fp4_support: false
|
| 416 |
+
spec_source: "Ascend 910B4 — inference variant, 32 GB HBM (half of B1/B2/B3). Atlas 800I A2 inference server."
|
| 417 |
+
notes_en: "910B4 is the inference-oriented 910B variant. 32 GB HBM (half of training variants), 280 TFLOPS FP16. Deployed in Atlas 800I A2 inference servers."
|
| 418 |
+
notes_zh: "910B4 是 910B 系列的推理版本。32 GB HBM(训练版本的一半),FP16 算力 280 TFLOPS。搭载于 Atlas 800I A2 推理服务器。"
|
| 419 |
+
|
| 420 |
+
- id: "910C"
|
| 421 |
+
aliases: [Ascend-910C]
|
| 422 |
+
memory_gb: 64
|
| 423 |
+
nvlink_bandwidth_gbps: 400
|
| 424 |
+
memory_bandwidth_gbps: 3200
|
| 425 |
+
fp16_tflops: 780
|
| 426 |
+
fp8_support: false
|
| 427 |
+
fp4_support: false
|
| 428 |
+
spec_source: "Huawei Ascend 910C — launched 2024, commonly cited specs pending official datasheet"
|
| 429 |
+
notes_en: "Huawei Ascend 910C (2024). Roughly 2x compute vs 910B at similar memory. FP8 support status unclear — check CANN version notes. Software ecosystem is maturing but still behind NVIDIA."
|
| 430 |
+
notes_zh: "华为昇腾 910C(2024 年)。算力大约是 910B 的两倍,显存相当。FP8 支持情况需看 CANN 版本。软件生态持续完善但仍落后于 NVIDIA。"
|
| 431 |
+
|
| 432 |
+
- id: Atlas-300I-Duo
|
| 433 |
+
aliases: [Atlas300IDuo, 300I-Duo]
|
| 434 |
+
memory_gb: 48
|
| 435 |
+
nvlink_bandwidth_gbps: 0
|
| 436 |
+
memory_bandwidth_gbps: 204
|
| 437 |
+
fp16_tflops: 140
|
| 438 |
+
fp8_support: false
|
| 439 |
+
fp4_support: false
|
| 440 |
+
spec_source: "Huawei Atlas 300I Duo inference card — 2x Ascend 310P per card. 140 TFLOPS FP16 per card, 48 GB LPDDR4X."
|
| 441 |
+
notes_en: "Huawei Atlas 300I Duo inference card: 2x Ascend 310P with combined 48 GB LPDDR4X (96 GB variant available). 280 TOPS INT8. LPDDR4X gives 204 GB/s total bandwidth — much lower than HBM-based cards. PCIe-only, no NVLink. Best for cost-sensitive inference."
|
| 442 |
+
notes_zh: "华为 Atlas 300I Duo 推理卡:双 Ascend 310P,合计 48 GB LPDDR4X(另有 96 GB 版本)。INT8 280 TOPS。显存是 LPDDR4X,带宽 204 GB/s,远低于 HBM 卡。仅 PCIe,无 NVLink。主要面向成本敏感的推理场景。"
|
| 443 |
+
|
| 444 |
+
# ========================================================================
|
| 445 |
+
# Chinese domestic AI accelerators (non-NVIDIA / non-AMD)
|
| 446 |
+
# ========================================================================
|
| 447 |
+
- id: MXC500
|
| 448 |
+
aliases: [MetaX-MXC500, XiYun-C500, 曦云C500]
|
| 449 |
+
memory_gb: 64
|
| 450 |
+
nvlink_bandwidth_gbps: 800
|
| 451 |
+
memory_bandwidth_gbps: 1800
|
| 452 |
+
fp16_tflops: 240
|
| 453 |
+
fp8_support: false
|
| 454 |
+
fp4_support: false
|
| 455 |
+
spec_source: "MetaX 沐曦 MXC500 / 曦云 C500 (PCIe variant, 350W). OAM variant has 280 TFLOPS FP16 @ 450W. 64 GB HBM2e, 1.8 TB/s memory bandwidth, MetaXLink interconnect."
|
| 456 |
+
notes_en: "MetaX (沐曦) MXC500. 7nm, CUDA-compatible via MXMACA stack. PCIe variant: 240 TFLOPS FP16, 350W. OAM variant: 280 TFLOPS FP16, 450W. Targets A100-class workloads. No native FP8."
|
| 457 |
+
notes_zh: "沐曦曦云 C500。7nm 工艺,通过 MXMACA 软件栈兼容 CUDA。PCIe 版本 FP16 240 TFLOPS / 350W,OAM 版本 280 TFLOPS / 450W。对标 A100 场景。不原生支持 FP8。"
|
| 458 |
+
|
| 459 |
+
- id: MXC550
|
| 460 |
+
aliases: [MetaX-MXC550, XiYun-C550, 曦云C550]
|
| 461 |
+
memory_gb: 64
|
| 462 |
+
nvlink_bandwidth_gbps: 896
|
| 463 |
+
memory_bandwidth_gbps: 1600
|
| 464 |
+
fp16_tflops: 240
|
| 465 |
+
fp8_support: false
|
| 466 |
+
fp4_support: false
|
| 467 |
+
spec_source: "MetaX 沐曦 MXC550 / 曦云 C550 (OAM, 2024). Partial specs from third-party comparison docs; full datasheet TBD. 8-card fabric bandwidth 896 GB/s."
|
| 468 |
+
notes_en: "MetaX (沐曦) MXC550 — 2024 OAM-format flagship. Supports OAM 1.5 + 2.0. 8-card fabric bandwidth 896 GB/s. Full specs pending official datasheet — figures here are from third-party comparison articles."
|
| 469 |
+
notes_zh: "沐曦曦云 C550 — 2024 年 OAM 形态旗舰。支持 OAM 1.5 + 2.0 规范。八卡全互联带宽 896 GB/s。完整规格待官方数据表披露,此处数字来自第三方对比资料。"
|
| 470 |
+
|
| 471 |
+
- id: Kunlun-P800
|
| 472 |
+
aliases: [KunlunXin-P800, 昆仑芯P800, Kunlun-Gen3]
|
| 473 |
+
memory_gb: 96
|
| 474 |
+
nvlink_bandwidth_gbps: 400
|
| 475 |
+
memory_bandwidth_gbps: 2000
|
| 476 |
+
fp16_tflops: 345
|
| 477 |
+
fp8_support: true
|
| 478 |
+
fp4_support: false
|
| 479 |
+
spec_source: "KunlunXin P800 (3rd gen, 2024). 96 GB HBM3 (largest among Chinese domestic AI chips). Baidu Cloud uses P800 for first-party inference. Specs partially inferred from public Baidu announcements; official datasheet limited distribution."
|
| 480 |
+
notes_en: "Baidu KunlunXin P800 — 3rd gen, 2024. 96 GB HBM3. Reported to support 8-bit inference and MoE optimizations. Baidu's internal clusters run Kunlun P800 at 10k+ card scale. Figures here are from public Baidu materials; official spec sheet not fully public."
|
| 481 |
+
notes_zh: "百度昆仑芯 P800 — 第三代,2024 年。96 GB HBM3(国产 AI 芯片中显存最大之一)。报告支持 8bit 推理和 MoE 优化。百度内部 1 万卡以上规模部署。数字来自百度公开资料,完整规格表未完全披露。"
|
| 482 |
+
|
| 483 |
+
- id: Kunlun-R200
|
| 484 |
+
aliases: [KunlunXin-R200, 昆仑芯R200, Kunlun-Gen2]
|
| 485 |
+
memory_gb: 32
|
| 486 |
+
nvlink_bandwidth_gbps: 200
|
| 487 |
+
memory_bandwidth_gbps: 512
|
| 488 |
+
fp16_tflops: 128
|
| 489 |
+
fp8_support: false
|
| 490 |
+
fp4_support: false
|
| 491 |
+
spec_source: "KunlunXin R200 (2nd gen, 2021). 7nm XPU architecture. FP16 128 TFLOPS / INT8 256 TOPS."
|
| 492 |
+
notes_en: "Baidu KunlunXin R200 — 2nd gen, 7nm. FP16 128 TFLOPS, INT8 256 TOPS. XPU architecture. PCIe 4.0 + XCCL interconnect. No FP8."
|
| 493 |
+
notes_zh: "百度昆仑芯 R200 — 第二代,7nm XPU 架构。FP16 128 TFLOPS,INT8 256 TOPS。PCIe 4.0 + 昆仑芯互联 XCCL。无 FP8。"
|
| 494 |
+
|
| 495 |
+
- id: BR100
|
| 496 |
+
aliases: [Biren-BR100, 壁仞BR100, 壁砺100]
|
| 497 |
+
memory_gb: 64
|
| 498 |
+
nvlink_bandwidth_gbps: 512
|
| 499 |
+
memory_bandwidth_gbps: 1640
|
| 500 |
+
fp16_tflops: 1024
|
| 501 |
+
fp8_support: false
|
| 502 |
+
fp4_support: false
|
| 503 |
+
spec_source: "Biren 壁仞 BR100 (OAM, 550W). 7nm Chiplet, 77B transistors. BF16/FP16 1024 TFLOPS, INT8 2048 TOPS, 64 GB HBM2e 1.64 TB/s. BLINK 512 GB/s 8-card fabric."
|
| 504 |
+
notes_en: "Biren BR100 (壁仞) — 2022 flagship. OAM format, 550W. 1024 TFLOPS BF16/FP16 (PFLOPS class), 64 GB HBM2e. BLINK interconnect 512 GB/s (8-card fabric). No FP8. US export-restricted since 2022 — production status uncertain."
|
| 505 |
+
notes_zh: "壁仞 BR100 — 2022 年旗舰 OAM 卡,550W。BF16/FP16 1024 TFLOPS(PFLOPS 级),64 GB HBM2e。BLINK 互联 512 GB/s(8 卡全互联)。无 FP8。2022 年被美国出口管制,后续量产状态不明。"
|
| 506 |
+
|
| 507 |
+
- id: BR104
|
| 508 |
+
aliases: [Biren-BR104, 壁仞BR104, 壁砺104]
|
| 509 |
+
memory_gb: 32
|
| 510 |
+
nvlink_bandwidth_gbps: 128
|
| 511 |
+
memory_bandwidth_gbps: 820
|
| 512 |
+
fp16_tflops: 512
|
| 513 |
+
fp8_support: false
|
| 514 |
+
fp4_support: false
|
| 515 |
+
spec_source: "Biren 壁仞 BR104 (PCIe, 300W). Single-die version of BR100 with halved specs. BF16/FP16 512 TFLOPS, 32 GB HBM2e. Won MLPerf Inference ResNet50 and BERT single-card top-1 in its class."
|
| 516 |
+
notes_en: "Biren BR104 — PCIe single-die version of BR100. 300W, 512 TFLOPS BF16/FP16, 32 GB HBM2e. Won MLPerf Inference BERT (1.58x A100 in server mode). No FP8. Export-restricted."
|
| 517 |
+
notes_zh: "壁仞 BR104 — BR100 的单 die PCIe 版本。300W,BF16/FP16 512 TFLOPS,32 GB HBM2e。MLPerf Inference BERT 测试 server 模式性能达 A100 的 1.58 倍。无 FP8。已被出口管制。"
|
| 518 |
+
|
| 519 |
+
- id: BI-V100
|
| 520 |
+
aliases: [Iluvatar-BI-V100, 天数天垓100, TianGai-100]
|
| 521 |
+
memory_gb: 32
|
| 522 |
+
nvlink_bandwidth_gbps: 64
|
| 523 |
+
memory_bandwidth_gbps: 1200
|
| 524 |
+
fp16_tflops: 147
|
| 525 |
+
fp8_support: false
|
| 526 |
+
fp4_support: false
|
| 527 |
+
spec_source: "Iluvatar CoreX 天数智芯 BI-V100 (天垓100). 7nm, SIMT, 24B transistors, 2.5D CoWoS packaging. FP16 147 TFLOPS / INT8 295 TOPS. 32 GB HBM2, 1.2 TB/s bandwidth. PCIe 4.0 x16, 250W TDP."
|
| 528 |
+
notes_en: "Iluvatar (天数智芯) BI-V100 — training/general-purpose. 7nm SIMT architecture, 32 GB HBM2, 1.2 TB/s memory bandwidth. FP16 147 TFLOPS, INT8 295 TOPS. 250W TDP. Interconnect bandwidth per card is modest (~64 GB/s shared)."
|
| 529 |
+
notes_zh: "天数智芯 BI-V100(天垓100)— 训练/通用 GPU。7nm SIMT 架构,32 GB HBM2,1.2 TB/s 显存带宽。FP16 147 TFLOPS,INT8 295 TOPS。250W TDP。单卡互联带宽 ~64 GB/s,相对较低。"
|
| 530 |
+
|
| 531 |
+
- id: MR-V100
|
| 532 |
+
aliases: [Iluvatar-MR-V100, 天数智铠100, ZhiKai-100]
|
| 533 |
+
memory_gb: 32
|
| 534 |
+
nvlink_bandwidth_gbps: 0
|
| 535 |
+
memory_bandwidth_gbps: 1200
|
| 536 |
+
fp16_tflops: 100
|
| 537 |
+
fp8_support: false
|
| 538 |
+
fp4_support: false
|
| 539 |
+
spec_source: "Iluvatar CoreX 天数智芯 智铠100 (MR-V100) 2022. Inference card, 32 GB HBM2E, ~200 TFLOPS BF16/FP16-low-precision-aggregated, 128-channel 1080p video decode, 150W TDP."
|
| 540 |
+
notes_en: "Iluvatar inference card (智铠100). 32 GB HBM2E. 150W TDP. Primarily inference-focused — mixed-precision aggregated throughput ~200 TFLOPS."
|
| 541 |
+
notes_zh: "天数智芯智铠100 推理卡。32 GB HBM2E,150W TDP。主要面向推理场景,混合精度聚合算力约 200 TFLOPS。"
|
| 542 |
+
|
| 543 |
+
- id: MLU370-X8
|
| 544 |
+
aliases: [Cambricon-MLU370-X8, 寒武纪MLU370-X8, 思元370-X8]
|
| 545 |
+
memory_gb: 48
|
| 546 |
+
nvlink_bandwidth_gbps: 200
|
| 547 |
+
memory_bandwidth_gbps: 614
|
| 548 |
+
fp16_tflops: 48
|
| 549 |
+
fp8_support: false
|
| 550 |
+
fp4_support: false
|
| 551 |
+
spec_source: "Cambricon 寒武纪 MLU370-X8 (dual MLU370 chiplet, 250W). 48 GB LPDDR5, INT8 256 TOPS, FP32 24 TFLOPS (FP16 ~48 TFLOPS estimated, official not given). MLU-Link 200 GB/s."
|
| 552 |
+
notes_en: "Cambricon (寒武纪) MLU370-X8 — dual-chip package, 250W. 48 GB LPDDR5 (not HBM), INT8 256 TOPS, FP32 24 TFLOPS. MLU-Link 200 GB/s for 8-card setups. LPDDR5 means lower memory bandwidth than HBM cards."
|
| 553 |
+
notes_zh: "寒武纪 MLU370-X8 — 双芯粒封装,250W。48 GB LPDDR5(非 HBM),INT8 256 TOPS,FP32 24 TFLOPS。MLU-Link 200 GB/s,支持 8 卡部署。LPDDR5 意味着显存带宽低于 HBM 卡。"
|
| 554 |
+
|
| 555 |
+
- id: MLU590
|
| 556 |
+
aliases: [Cambricon-MLU590, 寒武纪MLU590, 思元590]
|
| 557 |
+
memory_gb: 80
|
| 558 |
+
nvlink_bandwidth_gbps: 372
|
| 559 |
+
memory_bandwidth_gbps: 2000
|
| 560 |
+
fp16_tflops: 314
|
| 561 |
+
fp8_support: false
|
| 562 |
+
fp4_support: false
|
| 563 |
+
spec_source: "Cambricon 寒武纪 思元590 (MLU590) — 7nm, MLUv02/MLUarch05. 80 GB HBM (likely HBM2e based on 2 TB/s bandwidth), FP16 314 TFLOPS, FP32 80 TFLOPS, MLU-Link 372 GB/s. Used at Baidu ERNIE (文心一言) project."
|
| 564 |
+
notes_en: "Cambricon (寒武纪) MLU590 — flagship AI training chip. 80 GB HBM, 2 TB/s memory bandwidth. FP16 314 TFLOPS (dense). MLU-Link 372 GB/s 8-card fabric. FP16 compute is comparable to NVIDIA A100. No FP8. Production volume and ecosystem still maturing."
|
| 565 |
+
notes_zh: "寒武纪思元590 — 旗舰 AI 训练芯片。80 GB HBM,2 TB/s 显存带宽。FP16 314 TFLOPS(dense),综合性能约为 A100 级别。MLU-Link 372 GB/s 八卡互联。无 FP8。量产规模和生态仍在成熟。"
|
| 566 |
+
|
| 567 |
+
- id: Hygon-K100-AI
|
| 568 |
+
aliases: [K100-AI, 海光K100AI, DCU-K100-AI]
|
| 569 |
+
memory_gb: 64
|
| 570 |
+
nvlink_bandwidth_gbps: 184
|
| 571 |
+
memory_bandwidth_gbps: 896
|
| 572 |
+
fp16_tflops: 192
|
| 573 |
+
fp8_support: false
|
| 574 |
+
fp4_support: false
|
| 575 |
+
spec_source: "Hygon 海光 K100 AI — DCU architecture (GPGPU+AI hybrid), 64 GB HBM, 896 GB/s memory bandwidth, 350W TDP. FP16 192 TFLOPS dense (some sources cite 256 TFLOPS but values vary). xGMI 184 GB/s."
|
| 576 |
+
notes_en: "Hygon (海光) K100 AI — DCU series. 64 GB HBM, 896 GB/s bandwidth. FP16 192 TFLOPS (industry reports vary 100-256 TFLOPS depending on compute unit/mode). ROCm-compatible, can leverage AMD software ecosystem. Positioned against A800 for Chinese market. 350W TDP."
|
| 577 |
+
notes_zh: "海光 K100 AI — DCU 系列。64 GB HBM,896 GB/s 带宽。FP16 192 TFLOPS(公开资料数字因计算单元和精度模式不同有 100-256 TFLOPS 差异)。兼容 ROCm,可复用 AMD 软件生态。面向国产 A800 替代场景。350W TDP。"
|
| 578 |
+
|
| 579 |
+
- id: Hygon-Z100
|
| 580 |
+
aliases: [Z100, 海光Z100, DCU-Z100, 深算二号]
|
| 581 |
+
memory_gb: 32
|
| 582 |
+
nvlink_bandwidth_gbps: 184
|
| 583 |
+
memory_bandwidth_gbps: 1000
|
| 584 |
+
fp16_tflops: 180
|
| 585 |
+
fp8_support: false
|
| 586 |
+
fp4_support: false
|
| 587 |
+
spec_source: "Hygon 海光 DCU Z100 (深算二号) — 32 GB HBM2, 1 TB/s bandwidth, 8192 compute cores, FP32 90 TFLOPS, FP16 ~180 TFLOPS (2x FP32), FP64 10.8 TFLOPS. xGMI 184 GB/s. Performance reported as 80-90% of A100. 350W TDP."
|
| 588 |
+
notes_en: "Hygon (海光) DCU Z100 / 深算二号. 32 GB HBM2, 1 TB/s bandwidth, 8192 compute units. FP16 180 TFLOPS, FP32 90 TFLOPS, FP64 10.8 TFLOPS. 350W. Performance cited at 80-90% of A100. ROCm stack, PCIe Gen4 + xGMI multi-card."
|
| 589 |
+
notes_zh: "海光 DCU Z100(深算二号)。32 GB HBM2,1 TB/s 带宽,8192 计算单元。FP16 180 TFLOPS,FP32 90 TFLOPS,FP64 10.8 TFLOPS。350W。综合性能约为 A100 的 80-90%。基于 ROCm 栈,PCIe Gen4 + xGMI 多卡互联。"
|
| 590 |
+
|
| 591 |
+
- id: MTT-S4000
|
| 592 |
+
aliases: [MooreThreads-S4000, 摩尔线程S4000, MTT-S4000-48G]
|
| 593 |
+
memory_gb: 48
|
| 594 |
+
nvlink_bandwidth_gbps: 240
|
| 595 |
+
memory_bandwidth_gbps: 768
|
| 596 |
+
fp16_tflops: 100
|
| 597 |
+
fp8_support: false
|
| 598 |
+
fp4_support: false
|
| 599 |
+
spec_source: "Moore Threads MTT S4000 datasheet 2023 — 3rd-gen MUSA (曲院). 48 GB GDDR6, 768 GB/s bandwidth. FP16/BF16 100 TFLOPS, INT8 200 TOPS. MTLink 1.0 240 GB/s."
|
| 600 |
+
notes_en: "Moore Threads (摩尔线程) S4000 — domestic AI training card. 48 GB GDDR6 (not HBM), 768 GB/s. FP16/BF16 100 TFLOPS. MTLink 1.0 240 GB/s. CUDA compatibility via MUSA translation."
|
| 601 |
+
notes_zh: "摩尔线程 S4000 — 国产训推加速卡。48 GB GDDR6(非 HBM),768 GB/s 带宽。FP16/BF16 100 TFLOPS。MTLink 1.0 互联 240 GB/s。通过 MUSA 兼容 CUDA 生态。"
|
| 602 |
+
|
| 603 |
+
- id: MTT-S3000
|
| 604 |
+
aliases: [MooreThreads-S3000, 摩尔线程S3000]
|
| 605 |
+
memory_gb: 32
|
| 606 |
+
nvlink_bandwidth_gbps: 0
|
| 607 |
+
memory_bandwidth_gbps: 448
|
| 608 |
+
fp16_tflops: 30
|
| 609 |
+
fp8_support: false
|
| 610 |
+
fp4_support: false
|
| 611 |
+
spec_source: "Moore Threads MTT S3000 — MUSA 春晓 architecture. 32 GB GDDR6, 448 GB/s. FP32 ~15.2 TFLOPS inferred from S4000 comparison (S4000 is 64%+ higher); FP16 ~30 TFLOPS estimate (datasheet not fully public)."
|
| 612 |
+
notes_en: "Moore Threads (摩尔线程) S3000 — predecessor to S4000. 32 GB GDDR6, 448 GB/s. FP16 specs not fully published; estimated ~30 TFLOPS based on S4000 comparison. Multi-purpose server GPU, also supports rendering."
|
| 613 |
+
notes_zh: "摩尔线程 S3000 — S4000 的前代。32 GB GDDR6,448 GB/s。FP16 官方未完全披露,基于 S4000 对比推算约 30 TFLOPS。通用服务器 GPU,兼顾渲染场景。"
|
src/llm_cal/hardware/loader.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Hardware database loader + lookup."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from functools import lru_cache
|
| 6 |
+
from importlib.resources import files
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Literal
|
| 9 |
+
|
| 10 |
+
from pydantic import BaseModel, Field
|
| 11 |
+
|
| 12 |
+
from llm_cal.common.yaml_loader import load_yaml
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class GPUSpec(BaseModel):
    """Specification record for a single accelerator in the hardware database."""

    id: str
    aliases: list[str] = Field(default_factory=list)
    memory_gb: int
    nvlink_bandwidth_gbps: int
    # Memory (HBM/GDDR) bandwidth, which is a different figure from the NVLink
    # one above. Decode is memory-bandwidth-bound, so this is the number that
    # matters for decode throughput:
    #   per-token latency = active_weight_bytes / (memory_bandwidth × utilization)
    # 0 or None means unknown (performance module will skip bandwidth checks).
    memory_bandwidth_gbps: int | None = None
    fp16_tflops: float
    fp8_support: bool
    fp4_support: bool
    notes_en: str | None = None
    notes_zh: str | None = None
    # Provenance of the numeric specs: a URL to a vendor datasheet / trusted
    # benchmark, or a short note such as "NVIDIA H100 datasheet 2024-Q3", so
    # users can audit where the numbers came from.
    spec_source: str | None = None

    def localized_notes(self, locale: Literal["en", "zh"]) -> str | None:
        """Return the notes for *locale*, falling back to the other language.

        The requested language wins when it is set and non-empty; otherwise the
        other language's notes are returned (which may itself be ``None``).
        """
        if locale == "zh":
            preferred, fallback = self.notes_zh, self.notes_en
        else:
            preferred, fallback = self.notes_en, self.notes_zh
        return preferred or fallback
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class GPUDatabase(BaseModel):
    """Top-level database document: a schema version plus the list of GPU entries."""

    schema_version: int
    gpus: list[GPUSpec]
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class UnknownGPUError(Exception):
    """Raised when a requested GPU name matches no known id or alias."""
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _default_path() -> Path:
    """Return the path of the ``gpu_database.yaml`` bundled in ``llm_cal.hardware``."""
    package_root = files("llm_cal.hardware")
    bundled_yaml = package_root.joinpath("gpu_database.yaml")
    # The resource handle is stringified before being wrapped in a Path.
    return Path(str(bundled_yaml))
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
@lru_cache(maxsize=1)
def load_database(path: Path | None = None) -> GPUDatabase:
    """Load the GPU database from *path*, or from the bundled default file.

    The result is memoized with ``lru_cache(maxsize=1)``, so only the most
    recently used ``path`` argument stays cached.
    """
    source = path if path else _default_path()
    return load_yaml(source, GPUDatabase)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def lookup(gpu: str, db: GPUDatabase | None = None) -> GPUSpec:
    """Look up a GPU by id or alias. Case-insensitive.

    Args:
        gpu: Name typed by the user; surrounding whitespace is ignored.
        db: Database to search. When omitted (or falsy), ``load_database()``
            is used.

    Returns:
        The first entry whose id or one of whose aliases equals ``gpu``
        (compared upper-cased).

    Raises:
        UnknownGPUError: No entry matches. If the name looks like the legacy
            ``<gpu>x<count>`` form (e.g. ``H800x8``), the message suggests the
            equivalent ``--gpu`` / ``--gpu-count`` flags instead.
    """
    database = db or load_database()
    target = gpu.strip().upper()
    for spec in database.gpus:
        if spec.id.upper() == target:
            return spec
        if any(alias.upper() == target for alias in spec.aliases):
            return spec
    # Helpful rejection for the legacy "<gpu>x<count>" spelling. Split on the
    # LAST "X" only: many GPU ids contain an "X" themselves (MI300X, MXC500),
    # so splitting on the first one would suggest a truncated id such as "M".
    base, separator, count = target.rpartition("X")
    if separator and base and count.isdigit():
        raise UnknownGPUError(
            f"'{gpu}' looks like old 'H800x8' format. "
            f"Use `--gpu {base} --gpu-count {count}` instead."
        )
    raise UnknownGPUError(f"Unknown GPU '{gpu}'. Known: {', '.join(s.id for s in database.gpus)}")
|
src/llm_cal/llm_review/__init__.py
ADDED
|
File without changes
|
src/llm_cal/llm_review/reviewer.py
ADDED
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Optional LLM-based second opinion on the tool's derivation trace.
|
| 2 |
+
|
| 3 |
+
Design constraints (from the tool's honesty principle):
|
| 4 |
+
1. Never overrides the 6 primary labels. LLM responses are tagged
|
| 5 |
+
[llm-opinion] — a distinct 7th label.
|
| 6 |
+
2. Opt-in only — requires --llm-review flag AND env vars set.
|
| 7 |
+
3. Non-fatal — if the API call fails, the main report still works.
|
| 8 |
+
4. User-chosen provider — supports any OpenAI-compatible endpoint
|
| 9 |
+
(OpenAI, DeepSeek, Moonshot, Zhipu, local vLLM, etc.)
|
| 10 |
+
5. Deterministic input — the prompt is built from the --explain
|
| 11 |
+
derivation trace, not free-form. The LLM gets structured math,
|
| 12 |
+
not prose.
|
| 13 |
+
6. The LLM's job is to CRITIQUE, not to REWRITE. The prompt
|
| 14 |
+
explicitly forbids generating new numbers.
|
| 15 |
+
|
| 16 |
+
Environment variables:
|
| 17 |
+
LLM_CAL_REVIEWER_API_KEY (required)
|
| 18 |
+
LLM_CAL_REVIEWER_BASE_URL (default: https://api.openai.com/v1)
|
| 19 |
+
LLM_CAL_REVIEWER_MODEL (default: gpt-4o)
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
from __future__ import annotations
|
| 23 |
+
|
| 24 |
+
import os
|
| 25 |
+
from dataclasses import dataclass
|
| 26 |
+
from typing import Literal
|
| 27 |
+
|
| 28 |
+
import httpx
|
| 29 |
+
|
| 30 |
+
from llm_cal.core.explain import ExplainEntry
|
| 31 |
+
|
| 32 |
+
Locale = Literal["en", "zh"]
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@dataclass(frozen=True)
class LLMReviewResult:
    """Immutable outcome of one LLM review attempt, successful or not."""

    # True when the endpoint returned a usable completion.
    ok: bool
    # The reviewer's text on success; None on failure.
    content: str | None
    # Human-readable failure description; None on success.
    error: str | None
    # Model name and endpoint base URL the attempt was configured with.
    model: str
    base_url: str
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def run_review(
    entries: list[ExplainEntry],
    locale: Locale,
    timeout_s: float = 60.0,
) -> LLMReviewResult:
    """Send the derivation trace to an LLM for audit.

    Configuration is read from the environment: ``LLM_CAL_REVIEWER_API_KEY``
    (required), ``LLM_CAL_REVIEWER_BASE_URL`` and ``LLM_CAL_REVIEWER_MODEL``.

    Args:
        entries: Derivation trace entries to render into the prompt.
        locale: ``"zh"`` selects the Chinese prompts; anything else English.
        timeout_s: Timeout passed to the HTTP client, in seconds.

    Returns:
        A LLMReviewResult. Request, HTTP-status and response-parsing failures
        are reported through ``ok=False`` / ``error`` rather than raised, so an
        unavailable reviewer does not break the main report.
    """
    api_key = os.environ.get("LLM_CAL_REVIEWER_API_KEY")
    base_url = os.environ.get("LLM_CAL_REVIEWER_BASE_URL", "https://api.openai.com/v1").rstrip("/")
    model = os.environ.get("LLM_CAL_REVIEWER_MODEL", "gpt-4o")

    def _failure(message: str) -> LLMReviewResult:
        # Every failure path reports the same model/base_url context.
        return LLMReviewResult(
            ok=False,
            content=None,
            error=message,
            model=model,
            base_url=base_url,
        )

    if not api_key:
        return _failure(
            "LLM_CAL_REVIEWER_API_KEY env var not set. "
            "Set it to the API key of an OpenAI-compatible endpoint "
            "(OpenAI, DeepSeek, Moonshot, Zhipu, etc.)."
        )

    prompt = _build_prompt(entries, locale)

    try:
        with httpx.Client(timeout=timeout_s) as client:
            resp = client.post(
                f"{base_url}/chat/completions",
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                },
                json={
                    "model": model,
                    "messages": [
                        {"role": "system", "content": _system_prompt(locale)},
                        {"role": "user", "content": prompt},
                    ],
                    "temperature": 0.1,
                    "max_tokens": 6000,
                },
            )
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        # HTTPError is the base of every httpx request/transport error
        # (timeouts, connect/read failures, protocol errors, bad scheme);
        # InvalidURL covers a malformed LLM_CAL_REVIEWER_BASE_URL.
        return _failure(f"{type(e).__name__}: {e}")

    if resp.status_code != 200:
        return _failure(f"HTTP {resp.status_code}: {resp.text[:500]}")

    try:
        data = resp.json()
        content = data["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # IndexError: empty "choices"; TypeError: JSON body (or a nested
        # value) is not the expected dict/list shape.
        return _failure(f"Malformed response: {type(e).__name__}: {e}")

    if not isinstance(content, str):
        # A null / non-text message would otherwise be reported as a success.
        return _failure(
            f"Malformed response: message content is {type(content).__name__}, expected str"
        )

    return LLMReviewResult(ok=True, content=content, error=None, model=model, base_url=base_url)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def _system_prompt(locale: Locale) -> str:
    """Return the system message casting the LLM as an auditor.

    ``"zh"`` yields the Chinese text; every other locale yields English.
    """
    chinese = (
        "你是一个大模型推理硬件计算工具的独立审计者。工具产出确定性的推导链,"
        "你的工作是发现数学错误、不合理假设或遗漏。你不负责重新计算,"
        "只负责评论和确认。输出简体中文。"
    )
    english = (
        "You are an independent auditor for a deterministic LLM inference hardware "
        "calculator. The tool produces a derivation trace; your job is to find math "
        "errors, unreasonable assumptions, or missing considerations. You do NOT "
        "recalculate; you only critique and confirm."
    )
    return chinese if locale == "zh" else english
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def _build_prompt(entries: list[ExplainEntry], locale: Locale) -> str:
    """Render *entries* into one trace and wrap it in the locale's prompt.

    Entries are formatted individually and separated by a blank line; ``"zh"``
    selects the Chinese wrapper, every other locale the English one.
    """
    rendered = [_format_entry(entry) for entry in entries]
    trace = "\n\n".join(rendered)
    wrap = _prompt_zh if locale == "zh" else _prompt_en
    return wrap(trace)
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def _format_entry(entry: ExplainEntry) -> str:
    """Render one trace entry as a plain-text section.

    The section always has a ``##`` heading, the formula and the result; the
    inputs, steps and source parts appear only when the entry has them.
    """
    lines: list[str] = [
        f"## {entry.heading}",
        f"Formula:\n{entry.formula}",
    ]
    if entry.inputs:
        lines.append("Inputs:")
        for item in entry.inputs:
            # The note, when present, trails the input in parentheses.
            suffix = f" ({item.note})" if item.note else ""
            lines.append(f" - {item.name} = {item.value} {item.label}{suffix}")
    if entry.steps:
        lines.append("Steps:")
        lines.extend(f" {step}" for step in entry.steps)
    lines.append(f"Result: {entry.result}")
    if entry.source:
        lines.append(f"Source: {entry.source}")
    return "\n".join(lines)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def _prompt_en(trace: str) -> str:
|
| 165 |
+
return f"""The deterministic tool produced this derivation trace for one model evaluation. \
|
| 166 |
+
Audit it.
|
| 167 |
+
|
| 168 |
+
<DERIVATION_TRACE>
|
| 169 |
+
{trace}
|
| 170 |
+
</DERIVATION_TRACE>
|
| 171 |
+
|
| 172 |
+
Respond in this structure. If a section has nothing to flag, write "none".
|
| 173 |
+
|
| 174 |
+
## Critical issues
|
| 175 |
+
(math errors or wrong formulas — would give wrong final answer)
|
| 176 |
+
|
| 177 |
+
## Moderate concerns
|
| 178 |
+
(unreasonable assumptions, factors off by 2x+, missing TP/sharding effects, etc.)
|
| 179 |
+
|
| 180 |
+
## Minor notes
|
| 181 |
+
(clarifications, stylistic, optional improvements)
|
| 182 |
+
|
| 183 |
+
## Consensus check
|
| 184 |
+
(which ExplainEntry headings look correct? name them explicitly)
|
| 185 |
+
|
| 186 |
+
Rules:
|
| 187 |
+
- Cite specific ExplainEntry heading names. Be concrete.
|
| 188 |
+
- Do NOT produce new numbers. Only critique.
|
| 189 |
+
- If you don't know, say so. Do not hallucinate.
|
| 190 |
+
- All your output must be tagged as a second opinion, NOT authoritative."""
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def _prompt_zh(trace: str) -> str:
|
| 194 |
+
return f"""下面是工具产出的一份完整推导链。请审计。
|
| 195 |
+
|
| 196 |
+
<DERIVATION_TRACE>
|
| 197 |
+
{trace}
|
| 198 |
+
</DERIVATION_TRACE>
|
| 199 |
+
|
| 200 |
+
按下面结构回复。没内容的段落写"无"。
|
| 201 |
+
|
| 202 |
+
## 关键错误
|
| 203 |
+
(数学错误或公式错误 —— 会导致最终答案错)
|
| 204 |
+
|
| 205 |
+
## 中度疑虑
|
| 206 |
+
(不合理假设、因子偏差 2x+、遗漏的 TP 分摊等)
|
| 207 |
+
|
| 208 |
+
## 次要备注
|
| 209 |
+
(澄清、风格、可选改进)
|
| 210 |
+
|
| 211 |
+
## 一致性核查
|
| 212 |
+
(哪些 ExplainEntry 标题看起来是对的?明确列出)
|
| 213 |
+
|
| 214 |
+
规则:
|
| 215 |
+
- 必须引用具体的 ExplainEntry 标题名。具体点。
|
| 216 |
+
- 不要产出新数字,只做评论。
|
| 217 |
+
- 不确定的地方直说。不要编造。
|
| 218 |
+
- 你的所有输出都只是 second opinion,不是权威答案。"""
|
src/llm_cal/model_source/__init__.py
ADDED
|
File without changes
|
src/llm_cal/model_source/auth.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Token discovery + user-friendly auth error messages."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def get_hf_token() -> str | None:
    """Look up the HuggingFace token in the process environment.

    ``HF_TOKEN`` takes precedence; ``HUGGING_FACE_HUB_TOKEN`` is consulted
    only when ``HF_TOKEN`` is unset or empty.

    Returns:
        The token string, or whatever the fallback lookup yields
        (``None`` when neither variable is set).
    """
    preferred = os.environ.get("HF_TOKEN")
    if preferred:
        return preferred
    return os.environ.get("HUGGING_FACE_HUB_TOKEN")
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def get_modelscope_token() -> str | None:
    """Look up the ModelScope token in the process environment.

    ``MODELSCOPE_API_TOKEN`` takes precedence; ``MODELSCOPE_TOKEN`` is
    consulted only when the former is unset or empty.

    Returns:
        The token string, or ``None`` when neither variable is set.
    """
    preferred = os.environ.get("MODELSCOPE_API_TOKEN")
    if preferred:
        return preferred
    return os.environ.get("MODELSCOPE_TOKEN")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def hf_auth_error_message(model_id: str) -> str:
    """Build the user-facing message for a gated or private HuggingFace model.

    Args:
        model_id: Repository id to name in the message.

    Returns:
        A two-line message: what went wrong, then how to authenticate.
    """
    problem = f"Model '{model_id}' requires authentication (gated or private)."
    remedy = "Set HF_TOKEN env var or run: huggingface-cli login"
    return f"{problem}\n{remedy}"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def modelscope_auth_error_message(model_id: str) -> str:
    """Build the user-facing message for a gated or private ModelScope model.

    The message is in Chinese; the full-width punctuation is intentional.

    Args:
        model_id: Repository id to name in the message.

    Returns:
        A two-line message: what went wrong, then how to authenticate.
    """
    problem = f"模型 '{model_id}' 需要登录(gated 或 私有)。"
    remedy = "设置 MODELSCOPE_API_TOKEN 环境变量,或执行:modelscope login"
    return f"{problem}\n{remedy}"
|
src/llm_cal/model_source/base.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""ModelSource ABC — HF and ModelScope implement this."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from abc import ABC, abstractmethod
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@dataclass(frozen=True)
class SiblingFile:
    """A single file listed in a model repository."""

    # Path of the file as reported by the source.
    filename: str
    # Size in bytes, or None when the source did not report one.
    size: int | None
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@dataclass(frozen=True)
class ModelArtifact:
    """Raw, uninterpreted data returned by a ``ModelSource``.

    Nothing is interpreted at this layer — interpretation lives in
    `architecture/` and `weight_analyzer/`. This is the thin "fetch" layer.
    """

    # Which source produced the artifact: "huggingface" | "modelscope".
    source: str
    # Repository id as requested by the caller.
    model_id: str
    # Commit the data was read at, when known; used as a cache key component.
    commit_sha: str | None
    # Parsed contents of config.json.
    config: dict[str, Any]
    # Every file in the repo.
    siblings: tuple[SiblingFile, ...]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class ModelNotFoundError(Exception):
    """Raised when the requested model id does not exist on the source."""
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class AuthRequiredError(Exception):
    """Raised when a model is gated or private and the user must set a token."""
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class SourceUnavailableError(Exception):
    """Raised on network errors, timeouts, rate limits and similar failures."""
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class ModelSource(ABC):
    """Common interface implemented by every model source (HF, ModelScope, ...)."""

    # Short identifier of the source; each subclass sets its own value.
    name: str

    @abstractmethod
    def fetch(self, model_id: str) -> ModelArtifact:
        """Fetch config.json and the sibling file list for ``model_id``.

        Args:
            model_id: Repository id on this source.

        Returns:
            The raw ``ModelArtifact`` for the model.

        Raises:
            ModelNotFoundError: 404.
            AuthRequiredError: 401/403 (gated/private).
            SourceUnavailableError: 429, 5xx, timeout, network down.
        """
|
src/llm_cal/model_source/huggingface.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""HuggingFace source. Uses `huggingface_hub` for metadata + `httpx` for config fetch.
|
| 2 |
+
|
| 3 |
+
Anti-pattern warning: do NOT call `list_repo_files()` then head-request each file.
|
| 4 |
+
Always use `model_info(files_metadata=True)` which returns all sibling sizes in
|
| 5 |
+
ONE request. Verified in `tests/test_hf.py` by asserting HTTP call count.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import json
|
| 11 |
+
from typing import Any
|
| 12 |
+
|
| 13 |
+
import httpx
|
| 14 |
+
from huggingface_hub import HfApi
|
| 15 |
+
from huggingface_hub.utils import (
|
| 16 |
+
GatedRepoError,
|
| 17 |
+
HfHubHTTPError,
|
| 18 |
+
RepositoryNotFoundError,
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
from llm_cal.model_source.auth import get_hf_token, hf_auth_error_message
|
| 22 |
+
from llm_cal.model_source.base import (
|
| 23 |
+
AuthRequiredError,
|
| 24 |
+
ModelArtifact,
|
| 25 |
+
ModelNotFoundError,
|
| 26 |
+
ModelSource,
|
| 27 |
+
SiblingFile,
|
| 28 |
+
SourceUnavailableError,
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
_CONFIG_URL = "https://huggingface.co/{model_id}/resolve/{revision}/config.json"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class HuggingFaceSource(ModelSource):
    """Model source backed by the HuggingFace Hub.

    Metadata (sibling files + commit sha) comes from a single
    ``model_info(files_metadata=True)`` call; config.json is fetched with
    ``httpx`` from the same endpoint, pinned to the commit sha when known.
    """

    name = "huggingface"

    def __init__(self, endpoint: str | None = None, timeout_s: float = 30.0) -> None:
        """Create the source.

        Args:
            endpoint: Base URL of the hub (e.g. a mirror). ``None`` uses
                ``https://huggingface.co`` for the config.json request.
            timeout_s: Timeout in seconds applied to both HTTP requests.
        """
        # huggingface_hub picks up HF_ENDPOINT env; we pass through for explicitness
        self._api = HfApi(endpoint=endpoint, token=get_hf_token())
        self._timeout_s = timeout_s
        # Stored without a trailing slash so the config URL can be joined safely.
        self._endpoint = (endpoint or "https://huggingface.co").rstrip("/")

    def fetch(self, model_id: str) -> ModelArtifact:
        """Fetch the sibling list, commit sha and config.json for ``model_id``.

        Raises:
            ModelNotFoundError: The repo (or its config.json) does not exist.
            AuthRequiredError: The repo is gated/private (401/403).
            SourceUnavailableError: Rate limit, other HTTP error, timeout or
                network failure.
        """
        token = get_hf_token()

        # Step 1: siblings + commit sha in ONE request.
        # CRITICAL: files_metadata=True returns every sibling size at once;
        # do not list files and then probe each one individually.
        try:
            info = self._api.model_info(
                repo_id=model_id,
                files_metadata=True,
                # Honour the configured timeout; the timeout error below quotes it.
                timeout=self._timeout_s,
                token=token,
            )
        except RepositoryNotFoundError as e:
            raise ModelNotFoundError(f"Model '{model_id}' not found on HuggingFace.") from e
        except GatedRepoError as e:
            raise AuthRequiredError(hf_auth_error_message(model_id)) from e
        except HfHubHTTPError as e:
            status = getattr(e.response, "status_code", None)
            if status in (401, 403):
                raise AuthRequiredError(hf_auth_error_message(model_id)) from e
            if status == 429:
                retry = e.response.headers.get("Retry-After", "unknown")
                raise SourceUnavailableError(
                    f"HuggingFace rate limit (429). Retry-After: {retry}s. "
                    "Setting HF_TOKEN increases your quota."
                ) from e
            raise SourceUnavailableError(f"HuggingFace error ({status}): {e}") from e
        except (httpx.TimeoutException, TimeoutError) as e:
            raise SourceUnavailableError(
                f"HuggingFace request timed out after {self._timeout_s}s."
            ) from e

        siblings = tuple(
            SiblingFile(filename=s.rfilename, size=s.size) for s in (info.siblings or [])
        )
        commit_sha = info.sha

        # Step 2: fetch config.json. If commit sha is available, pin to it so we don't
        # race with repo updates between the two calls.
        config = self._fetch_config(model_id, commit_sha or "main", token)

        return ModelArtifact(
            source=self.name,
            model_id=model_id,
            commit_sha=commit_sha,
            config=config,
            siblings=siblings,
        )

    def _fetch_config(self, model_id: str, revision: str, token: str | None) -> dict[str, Any]:
        """Download and parse config.json at ``revision``.

        Args:
            model_id: Repository id.
            revision: Commit sha or branch name to resolve the file at.
            token: Bearer token, or ``None`` for anonymous access.

        Returns:
            The parsed config.json object.

        Raises:
            ModelNotFoundError: config.json does not exist (404).
            AuthRequiredError: 401/403.
            SourceUnavailableError: 429, any other HTTP status >= 400, any
                httpx transport failure, invalid JSON, or a JSON document
                that is not an object.
        """
        # Build the URL from the configured endpoint so mirrors are honoured;
        # with the default endpoint this is the huggingface.co resolve URL.
        url = f"{self._endpoint}/{model_id}/resolve/{revision}/config.json"
        headers = {"Authorization": f"Bearer {token}"} if token else {}
        try:
            resp = httpx.get(url, headers=headers, timeout=self._timeout_s, follow_redirects=True)
        except httpx.HTTPError as e:
            # Covers timeouts and connect errors as well as read/protocol/proxy
            # failures, so callers only ever see SourceUnavailableError.
            raise SourceUnavailableError(f"config.json fetch failed: {e}") from e

        if resp.status_code == 404:
            raise ModelNotFoundError(
                f"Model '{model_id}' exists but has no config.json. "
                "May be a GGUF-only or dataset repo (not supported in v0.1)."
            )
        if resp.status_code in (401, 403):
            raise AuthRequiredError(hf_auth_error_message(model_id))
        if resp.status_code == 429:
            retry = resp.headers.get("Retry-After", "unknown")
            raise SourceUnavailableError(f"HuggingFace rate limit (429). Retry-After: {retry}s.")
        if resp.status_code >= 400:
            raise SourceUnavailableError(f"config.json fetch returned HTTP {resp.status_code}")

        try:
            parsed: Any = json.loads(resp.text)
        except json.JSONDecodeError as e:
            raise SourceUnavailableError(
                f"config.json is not valid JSON (line {e.lineno} col {e.colno}): {e.msg}"
            ) from e
        if not isinstance(parsed, dict):
            # A list/scalar would break every consumer that expects a mapping.
            raise SourceUnavailableError("config.json did not parse to a JSON object.")
        return parsed
|
src/llm_cal/model_source/modelscope.py
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""ModelScope source — REST-only via httpx.
|
| 2 |
+
|
| 3 |
+
Decision: Option B from ADR-001. We don't need the official `modelscope` SDK
|
| 4 |
+
because llm-cal only requires three things:
|
| 5 |
+
1. List repo files + sizes (one API call)
|
| 6 |
+
2. Fetch config.json (one API call)
|
| 7 |
+
3. Range-GET a safetensors header (handled by safetensors_reader)
|
| 8 |
+
|
| 9 |
+
The SDK pulls heavy ML deps by default (torch / tf for some install paths).
|
| 10 |
+
REST keeps the install footprint flat, mirrors the existing httpx hot path,
|
| 11 |
+
and gives us identical exception semantics across HF + MS.
|
| 12 |
+
|
| 13 |
+
Endpoints (verified against modelscope.cn public docs, 2026-04):
|
| 14 |
+
* GET /api/v1/models/{owner}/{name} — model meta
|
| 15 |
+
* GET /api/v1/models/{owner}/{name}/repo/files?Recursive=true
|
| 16 |
+
— file tree + sizes
|
| 17 |
+
* GET /api/v1/models/{owner}/{name}/repo?FilePath=...&Revision=...
|
| 18 |
+
— raw file content
|
| 19 |
+
|
| 20 |
+
ModelScope wraps every response in a {Code, Message, Data, Success} envelope.
|
| 21 |
+
Field casing is PascalCase. We parse defensively — fields may evolve.
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
from __future__ import annotations
|
| 25 |
+
|
| 26 |
+
import json
|
| 27 |
+
from typing import Any
|
| 28 |
+
|
| 29 |
+
import httpx
|
| 30 |
+
|
| 31 |
+
from llm_cal.model_source.auth import (
|
| 32 |
+
get_modelscope_token,
|
| 33 |
+
modelscope_auth_error_message,
|
| 34 |
+
)
|
| 35 |
+
from llm_cal.model_source.base import (
|
| 36 |
+
AuthRequiredError,
|
| 37 |
+
ModelArtifact,
|
| 38 |
+
ModelNotFoundError,
|
| 39 |
+
ModelSource,
|
| 40 |
+
SiblingFile,
|
| 41 |
+
SourceUnavailableError,
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
DEFAULT_ENDPOINT = "https://www.modelscope.cn"
|
| 45 |
+
DEFAULT_REVISION = "master"
|
| 46 |
+
|
| 47 |
+
_INFO_PATH = "/api/v1/models/{model_id}"
|
| 48 |
+
_FILES_PATH = "/api/v1/models/{model_id}/repo/files"
|
| 49 |
+
_RAW_PATH = "/api/v1/models/{model_id}/repo"
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class ModelScopeSource(ModelSource):
    """Model source backed by the ModelScope REST API (via ``httpx``)."""

    name = "modelscope"

    def __init__(
        self,
        endpoint: str | None = None,
        timeout_s: float = 30.0,
        revision: str = DEFAULT_REVISION,
    ) -> None:
        """Create the source.

        Args:
            endpoint: Base URL of the API; ``None`` uses ``DEFAULT_ENDPOINT``.
                A trailing slash is stripped.
            timeout_s: Timeout in seconds applied to every request.
            revision: Revision used when no commit sha can be discovered.
        """
        self._endpoint = (endpoint or DEFAULT_ENDPOINT).rstrip("/")
        self._timeout_s = timeout_s
        self._revision = revision

    def fetch(self, model_id: str) -> ModelArtifact:
        """Fetch the sibling list, commit sha and config.json for ``model_id``.

        Raises:
            ModelNotFoundError: 404 from the file list or config request.
            AuthRequiredError: 401/403.
            SourceUnavailableError: 429, other HTTP errors, network failures,
                or malformed payloads.
        """
        token = get_modelscope_token()
        headers = self._auth_headers(token)

        # Step 1: model info — gives us a commit pin when available.
        # We tolerate missing info; fall back to the configured revision so
        # that the file list + config calls still work.
        commit_sha = self._fetch_commit_sha(model_id, headers)

        # Step 2: file tree with sizes. ONE call, recursive, includes sub-folders.
        siblings = self._list_files(model_id, commit_sha or self._revision, headers)

        # Step 3: config.json. Pin to the commit sha when we have it so two
        # back-to-back calls don't race against a repo update.
        config = self._fetch_config(model_id, commit_sha or self._revision, headers)

        return ModelArtifact(
            source=self.name,
            model_id=model_id,
            commit_sha=commit_sha,
            config=config,
            siblings=siblings,
        )

    # ------------------------------------------------------------------ helpers

    def _auth_headers(self, token: str | None) -> dict[str, str]:
        """Return the bearer-auth header dict, or an empty dict without a token."""
        return {"Authorization": f"Bearer {token}"} if token else {}

    def _fetch_commit_sha(self, model_id: str, headers: dict[str, str]) -> str | None:
        """Best-effort lookup of the model's commit sha.

        Never raises for network, HTTP-status or payload problems: any such
        failure yields ``None`` and the caller falls back to its revision.
        """
        url = f"{self._endpoint}{_INFO_PATH.format(model_id=model_id)}"
        try:
            resp = httpx.get(
                url, headers=headers, timeout=self._timeout_s, follow_redirects=True
            )
        except httpx.HTTPError:
            # Soft fail — commit sha is best-effort. httpx.HTTPError is the
            # base of the timeout/connect/transport errors.
            return None

        if resp.status_code != 200:
            return None
        try:
            payload = resp.json()
        except ValueError:
            # json.JSONDecodeError and body-decoding errors are ValueErrors;
            # a best-effort lookup must not crash on either.
            return None

        data = payload.get("Data") if isinstance(payload, dict) else None
        if not isinstance(data, dict):
            return None
        # Field name has bounced between LatestSha / latest_sha / Revision in
        # historical docs; check several.
        for key in ("LatestSha", "latest_sha", "Revision", "Sha"):
            v = data.get(key)
            if isinstance(v, str) and v:
                return v
        return None

    def _list_files(
        self, model_id: str, revision: str, headers: dict[str, str]
    ) -> tuple[SiblingFile, ...]:
        """List every blob in the repo at ``revision`` with its size.

        Raises:
            ModelNotFoundError / AuthRequiredError / SourceUnavailableError:
                See ``_raise_for_status``; also ``SourceUnavailableError`` on
                network failure, non-JSON body, or an unexpected payload shape.
        """
        url = f"{self._endpoint}{_FILES_PATH.format(model_id=model_id)}"
        params = {"Recursive": "true", "Revision": revision}
        try:
            resp = httpx.get(
                url,
                headers=headers,
                params=params,
                timeout=self._timeout_s,
                follow_redirects=True,
            )
        except httpx.HTTPError as e:
            # Any transport failure (not just timeout/connect) is reported as
            # SourceUnavailableError, as the ModelSource contract requires.
            raise SourceUnavailableError(f"ModelScope file list failed: {e}") from e

        self._raise_for_status(resp, model_id, what="file list")

        try:
            payload = resp.json()
        except json.JSONDecodeError as e:
            raise SourceUnavailableError(
                f"ModelScope file list returned non-JSON: {e}"
            ) from e

        files = _extract_files(payload)
        if files is None:
            raise SourceUnavailableError(
                "ModelScope file list payload had unexpected shape — "
                "neither Data.Files nor Data is a list."
            )

        siblings: list[SiblingFile] = []
        for f in files:
            if not isinstance(f, dict) or not isinstance(f.get("Path"), str):
                continue
            # Only include blobs (not directories). Type=tree means folder.
            if f.get("Type", "blob") == "tree":
                continue
            size = f.get("Size")
            # SiblingFile.size is `int | None`; treat anything else as unknown.
            if not isinstance(size, int) or isinstance(size, bool):
                size = None
            siblings.append(SiblingFile(filename=f["Path"], size=size))
        return tuple(siblings)

    def _fetch_config(
        self, model_id: str, revision: str, headers: dict[str, str]
    ) -> dict[str, Any]:
        """Download and parse config.json at ``revision``.

        Raises:
            ModelNotFoundError / AuthRequiredError / SourceUnavailableError:
                See ``_raise_for_status``; also ``SourceUnavailableError`` on
                network failure, invalid JSON, or a non-object document.
        """
        url = f"{self._endpoint}{_RAW_PATH.format(model_id=model_id)}"
        params = {"FilePath": "config.json", "Revision": revision}
        try:
            resp = httpx.get(
                url,
                headers=headers,
                params=params,
                timeout=self._timeout_s,
                follow_redirects=True,
            )
        except httpx.HTTPError as e:
            # Same broadened handling as the file list request.
            raise SourceUnavailableError(f"config.json fetch failed: {e}") from e

        self._raise_for_status(resp, model_id, what="config.json")

        try:
            parsed: Any = json.loads(resp.text)
        except json.JSONDecodeError as e:
            raise SourceUnavailableError(
                f"config.json is not valid JSON (line {e.lineno} col {e.colno}): {e.msg}"
            ) from e
        if not isinstance(parsed, dict):
            raise SourceUnavailableError(
                "config.json did not parse to a JSON object."
            )
        return parsed

    def _raise_for_status(
        self, resp: httpx.Response, model_id: str, what: str
    ) -> None:
        """Translate a non-200 response into the matching source exception.

        Args:
            resp: The HTTP response to inspect.
            model_id: Repository id, used in error messages.
            what: Short description of the request, used in error messages.

        Raises:
            ModelNotFoundError: 404.
            AuthRequiredError: 401/403.
            SourceUnavailableError: 429 and every other non-200 status.
        """
        if resp.status_code == 200:
            return
        if resp.status_code == 404:
            raise ModelNotFoundError(
                f"Model '{model_id}' not found on ModelScope ({what})."
            )
        if resp.status_code in (401, 403):
            raise AuthRequiredError(modelscope_auth_error_message(model_id))
        if resp.status_code == 429:
            retry = resp.headers.get("Retry-After", "unknown")
            raise SourceUnavailableError(
                f"ModelScope rate limit (429). Retry-After: {retry}s. "
                "Setting MODELSCOPE_API_TOKEN increases your quota."
            )
        raise SourceUnavailableError(
            f"ModelScope {what} returned HTTP {resp.status_code}"
        )
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
def _extract_files(payload: Any) -> list[Any] | None:
|
| 214 |
+
"""Pull the file list out of the wrapped ModelScope envelope.
|
| 215 |
+
|
| 216 |
+
Tolerates two known shapes:
|
| 217 |
+
A) {Data: {Files: [...]}} — most common
|
| 218 |
+
B) {Data: [...]} — older / list-only endpoints
|
| 219 |
+
"""
|
| 220 |
+
if not isinstance(payload, dict):
|
| 221 |
+
return None
|
| 222 |
+
data = payload.get("Data")
|
| 223 |
+
if isinstance(data, dict):
|
| 224 |
+
files = data.get("Files")
|
| 225 |
+
if isinstance(files, list):
|
| 226 |
+
return files
|
| 227 |
+
if isinstance(data, list):
|
| 228 |
+
return data
|
| 229 |
+
return None
|
src/llm_cal/output/__init__.py
ADDED
|
File without changes
|
src/llm_cal/output/formatter.py
ADDED
|
@@ -0,0 +1,665 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Rich-formatted, fully i18n'd output for EvaluationReport.
|
| 2 |
+
|
| 3 |
+
Every visible string flows through `common.i18n.t()`. To add another locale,
|
| 4 |
+
add entries to `_MESSAGES` in i18n.py; no changes here needed.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
from typing import Any
|
| 10 |
+
|
| 11 |
+
from rich.console import Console
|
| 12 |
+
from rich.panel import Panel
|
| 13 |
+
from rich.table import Table
|
| 14 |
+
from rich.text import Text
|
| 15 |
+
|
| 16 |
+
from llm_cal.common.i18n import get_locale, t
|
| 17 |
+
from llm_cal.core.evaluator import EvaluationReport
|
| 18 |
+
from llm_cal.engine_compat.loader import EngineCompatEntry, EngineFlag, EngineSource
|
| 19 |
+
from llm_cal.fleet.planner import FleetRecommendation
|
| 20 |
+
from llm_cal.hardware.loader import GPUDatabase
|
| 21 |
+
from llm_cal.output.labels import AnnotatedValue, Label
|
| 22 |
+
|
| 23 |
+
# Rich style string used to render the tag of each provenance label.
_LABEL_STYLES: dict[Label, str] = {
    Label.VERIFIED: "bold green",
    Label.INFERRED: "cyan",
    Label.ESTIMATED: "yellow",
    Label.CITED: "blue",
    Label.UNVERIFIED: "bold yellow",
    Label.UNKNOWN: "dim red",
    Label.LLM_OPINION: "magenta",
}
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def format_tag(av: AnnotatedValue[Any]) -> Text:
    """Build the bracketed, styled tag shown next to an annotated value.

    Args:
        av: The annotated value whose label should be rendered.

    Returns:
        A ``Text`` of the form ``[<localized label>]``, styled per
        ``_LABEL_STYLES`` (``"white"`` when the label has no entry).
    """
    label = av.label
    tag_style = _LABEL_STYLES.get(label, "white")
    localized = t(f"label.{label.value}")  # localized; falls back to English
    return Text(f"[{localized}]", style=tag_style)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _fmt_bytes(n: int) -> str:
|
| 41 |
+
if n >= 1_000_000_000:
|
| 42 |
+
return f"{n / 1_000_000_000:.2f} GB"
|
| 43 |
+
if n >= 1_000_000:
|
| 44 |
+
return f"{n / 1_000_000:.2f} MB"
|
| 45 |
+
if n >= 1_000:
|
| 46 |
+
return f"{n / 1_000:.2f} KB"
|
| 47 |
+
return f"{n} B"
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _fmt_params(n: int) -> str:
|
| 51 |
+
if n >= 1_000_000_000:
|
| 52 |
+
return f"{n / 1_000_000_000:.2f}B"
|
| 53 |
+
if n >= 1_000_000:
|
| 54 |
+
return f"{n / 1_000_000:.2f}M"
|
| 55 |
+
return str(n)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def render(report: EvaluationReport, console: Console | None = None) -> None:
    """Print the full evaluation report to a Rich console.

    Prints a header panel (model id, source and short commit sha when
    available), then each report section in a fixed order, and finally the
    label legend.

    Args:
        report: The evaluation report to display.
        console: Console to print to; a new ``Console()`` is created when
            omitted.
    """
    if not console:
        console = Console()

    console.print()
    commit = report.commit_sha
    sha_frag = f" @ {commit[:7]}" if commit else ""
    title = f"[bold cyan]{report.model_id}[/bold cyan] "
    origin = f"[dim]{t('panel.via')} {report.source}{sha_frag}[/dim]"
    console.print(Panel.fit(title + origin, border_style="cyan"))

    # Section order is the on-screen order.
    sections = (
        _render_architecture,
        _render_weight,
        _render_kv_cache,
        _render_engine_compat,
        _render_hardware,
        _render_fleet,
        _render_performance,
        _render_command,
    )
    for section in sections:
        section(report, console)
    _render_label_legend(console)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def _render_architecture(report: EvaluationReport, console: Console) -> None:
    """Print the architecture section of the report.

    Always lists model type, family, confidence, layer count, hidden size
    and vocabulary size. Attention, compression ratios, MoE, sliding window
    and max position rows are added only when the profile carries them.
    Warnings found in ``profile.auxiliary`` are printed after the table.

    Args:
        report: The evaluation report; only ``report.profile`` is read.
        console: Console to print to.
    """
    profile = report.profile
    table = Table(title=t("section.architecture"), show_header=False, box=None, padding=(0, 2))
    for column, options in (("field", {"style": "dim"}), ("value", {}), ("label", {})):
        table.add_column(column, **options)
    add_row = table.add_row

    add_row(t("arch.model_type"), profile.model_type or t("arch.none"), _verified_tag())
    add_row(t("arch.family"), profile.family.value, _verified_tag())
    confidence = profile.confidence.value
    add_row(t("arch.confidence"), confidence, Text(f"[{confidence}]", style="magenta"))
    add_row(t("arch.layers"), str(profile.num_hidden_layers), _verified_tag())
    add_row(t("arch.hidden_size"), str(profile.hidden_size), _verified_tag())
    add_row(t("arch.vocab_size"), f"{profile.vocab_size:,}", _verified_tag())

    attention = profile.attention
    if attention is not None:
        add_row(
            t("arch.attention"),
            t(
                "arch.attn_summary",
                variant=attention.variant,
                heads=attention.num_heads,
                kv_heads=attention.num_kv_heads,
                head_dim=attention.head_dim,
            ),
            _verified_tag(),
        )
        ratios = attention.compress_ratios
        if ratios:
            # A ratio of 0 is counted as a dense entry in the summary.
            dense_count = sum(1 for ratio in ratios if ratio == 0)
            add_row(
                t("arch.compress_ratios"),
                t("arch.compress_ratios_summary", n=len(ratios), dense=dense_count),
                _verified_tag(),
            )

    moe = profile.moe
    if moe is not None:
        add_row(
            t("arch.moe"),
            t(
                "arch.moe_summary",
                routed=moe.num_routed_experts,
                shared=moe.num_shared_experts,
                topk=moe.num_experts_per_tok,
            ),
            _verified_tag(),
        )

    if profile.sliding_window:
        add_row(t("arch.sliding_window"), str(profile.sliding_window), _verified_tag())

    if profile.position and profile.position.max_position_embeddings:
        add_row(
            t("arch.max_position"),
            f"{profile.position.max_position_embeddings:,}",
            _verified_tag(),
        )

    console.print(table)

    extras = profile.auxiliary
    if extras.get("warning"):
        console.print(f"[red]⚠ {extras['warning']}[/red]")
    if extras.get("v0_1_unsupported"):
        console.print(f"[yellow]⚠ {t('arch.unsupported_state_space')}[/yellow]")
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def _render_weight(report: EvaluationReport, console: Console) -> None:
    """Print the weight summary, then the quantization reconciliation table.

    The first table lists the safetensors byte total, the estimated parameter
    count, bits-per-param (only when present) and the quantization guess, each
    followed by its provenance tag. When reconciliation candidates exist, at
    most the first six are tabulated and the best-matching scheme is printed.
    """
    weights = report.weight

    summary = Table(title=t("section.weights"), show_header=False, box=None, padding=(0, 2))
    summary.add_column("field", style="dim")
    summary.add_column("value")
    summary.add_column("label")

    rows = [
        (
            t("weights.safetensors_bytes"),
            _fmt_bytes(weights.total_bytes.value),
            format_tag(weights.total_bytes),
        ),
        (
            t("weights.params_estimated"),
            _fmt_params(report.total_params_estimate.value),
            format_tag(report.total_params_estimate),
        ),
    ]
    bits = weights.bits_per_param
    if bits is not None:
        rows.append((t("weights.bits_per_param"), f"{bits.value:.2f}", format_tag(bits)))
    guess = weights.quantization_guess
    rows.append((t("weights.quant_guess"), str(guess.value), format_tag(guess)))
    for cells in rows:
        summary.add_row(*cells)
    console.print(summary)

    recon = report.reconciliation
    if not recon.candidates:
        return

    recon_table = Table(
        title=t("section.reconciliation"),
        title_justify="left",
        show_header=True,
        header_style="dim",
        box=None,
        padding=(0, 2),
    )
    recon_table.add_column(t("recon.scheme"))
    for heading_key in ("recon.predicted", "recon.delta", "recon.error_pct"):
        recon_table.add_column(t(heading_key), justify="right")

    # Only the first six candidates are shown to keep the table short.
    for candidate in recon.candidates[:6]:
        if candidate.delta_bytes > 0:
            direction = t("recon.over")
        else:
            direction = t("recon.under")
        recon_table.add_row(
            candidate.scheme,
            _fmt_bytes(candidate.predicted_bytes),
            f"{_fmt_bytes(abs(candidate.delta_bytes))} {direction}",
            f"{candidate.relative_error * 100:.1f}%",
        )
    console.print(recon_table)
    console.print(f"[bold]{t('recon.best')}[/bold] {recon.best.value} {format_tag(recon.best)}")
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def _render_kv_cache(report: EvaluationReport, console: Console) -> None:
    """Print the KV-cache table: one row per context length in the report.

    Nothing is printed when ``report.kv_cache_by_context`` is empty.
    """
    per_context = report.kv_cache_by_context
    if not per_context:
        return

    kv_table = Table(
        title=t("section.kv_cache"),
        title_justify="left",
        show_header=True,
        header_style="dim",
        box=None,
        padding=(0, 2),
    )
    kv_table.add_column(t("kv.context"))
    kv_table.add_column(t("kv.kv_cache"), justify="right")
    kv_table.add_column(t("kv.label"))

    # Translate the unit word once; it is reused in every row.
    unit = t("kv.tokens")
    for context_len, annotated in per_context.items():
        kv_table.add_row(
            f"{context_len:,} {unit}",
            _fmt_bytes(annotated.value),
            format_tag(annotated),
        )
    console.print(kv_table)
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def _render_engine_compat(report: EvaluationReport, console: Console) -> None:
    """Print the engine-compatibility section for the matched engine entry.

    With no match (``report.engine_match`` is None) a blank line and a single
    "no match" notice are printed instead of a table. Otherwise the table shows
    version spec, support and verification level, followed by whichever of
    required flags, optional flags, locale-specific caveats and sources are
    non-empty.
    """
    match = report.engine_match
    if match is None:
        console.print()
        console.print(
            f"[dim]{t('section.engine_compat')}:[/dim] [yellow]{t('engine.no_match')}[/yellow]"
        )
        return

    compat = Table(
        title=f"{t('section.engine_compat')} — {match.engine}",
        show_header=False,
        box=None,
        padding=(0, 2),
    )
    compat.add_column("field", style="dim")
    compat.add_column("value")
    compat.add_column("label")

    # The same provenance tag is attached to the support and verification rows.
    tag = _verif_label(match)
    compat.add_row(t("engine.version_spec"), match.version_spec, Text(""))
    compat.add_row(t("engine.support"), match.support, tag)
    compat.add_row(t("engine.verification"), match.verification_level, tag)

    flag_groups = (
        ("engine.required_flags", match.required_flags),
        ("engine.optional_flags", match.optional_flags),
    )
    for heading_key, flags in flag_groups:
        if flags:
            rendered = "\n".join(_fmt_flag(flag) for flag in flags)
            compat.add_row(t(heading_key), rendered, Text(""))

    caveats = match.caveats_zh if get_locale() == "zh" else match.caveats_en
    if caveats:
        bullets = "\n".join(f"• {caveat}" for caveat in caveats)
        compat.add_row(t("engine.caveats"), bullets, Text(""))

    if match.sources:
        rendered_sources = "\n".join(_fmt_source(src) for src in match.sources)
        compat.add_row(t("engine.sources"), rendered_sources, Text(""))

    console.print(compat)
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
def _render_hardware(report: EvaluationReport, console: Console) -> None:
    """Print the hardware section for the GPU named in the report.

    A blank line is always emitted first. If the GPU could not be resolved
    (``report.gpu_spec`` is None) a red error line is printed, using
    ``report.gpu_error`` when set and a generic "Unknown GPU" message
    otherwise. For a resolved GPU the table lists memory, NVLink bandwidth,
    FP16 TFLOPS, FP8/FP4 support, and the localized notes and spec source
    when they are non-empty.
    """
    console.print()
    if report.gpu_spec is None:
        msg = report.gpu_error or f"Unknown GPU '{report.gpu}'"
        console.print(f"[bold red]{t('section.hardware')}:[/bold red] [red]{msg}[/red]")
        return

    spec = report.gpu_spec
    locale = get_locale()
    table = Table(
        title=f"{t('section.hardware')} — {spec.id}",
        show_header=False,
        box=None,
        padding=(0, 2),
    )
    table.add_column("field", style="dim")
    table.add_column("value")

    table.add_row(t("hw.memory"), f"{spec.memory_gb} GB HBM")
    # NVLink bandwidth is optional (it can be None or 0 for cards without
    # NVLink). Show the same "—" placeholder the --list-gpus table uses
    # instead of printing "None GB/s" or "0 GB/s".
    if spec.nvlink_bandwidth_gbps:
        nvlink_text = f"{spec.nvlink_bandwidth_gbps} GB/s"
    else:
        nvlink_text = "—"
    table.add_row(t("hw.nvlink_bandwidth"), nvlink_text)
    table.add_row(t("hw.fp16_tflops"), f"{spec.fp16_tflops:.0f} TFLOPS")
    table.add_row(t("hw.fp8_support"), t("hw.bool_yes") if spec.fp8_support else t("hw.bool_no"))
    table.add_row(t("hw.fp4_support"), t("hw.bool_yes") if spec.fp4_support else t("hw.bool_no"))
    notes = spec.localized_notes(locale)
    if notes:
        table.add_row(t("hw.notes"), notes)
    if spec.spec_source:
        table.add_row(t("hw.spec_source"), spec.spec_source)
    console.print(table)
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
def _render_fleet(report: EvaluationReport, console: Console) -> None:
    """Print the fleet-sizing table: one row per tier option.

    Without a fleet recommendation nothing is tabulated: if the GPU spec is
    also missing the function stays silent (the hardware section has already
    reported the error), otherwise a dim explanatory line is printed.

    Each row shows tier (starred when it is the best tier), GPU count, weight
    bytes per GPU, headroom per GPU, and one concurrency cell per selected
    context length. Options that do not fit are rendered in "dim red".
    """
    fleet = report.fleet
    if fleet is None:
        if report.gpu_spec is not None:
            console.print(f"[dim]{t('fleet.gpu_spec_unknown')}[/dim]")
        return

    # Context lengths that get their own concurrency column.
    context_columns = _select_concurrency_columns(fleet)

    fleet_table = Table(
        title=f"{t('section.fleet')} — {report.gpu_spec.id if report.gpu_spec else report.gpu}",
        title_justify="left",
        show_header=True,
        header_style="dim",
        box=None,
        padding=(0, 2),
    )
    fleet_table.add_column(t("fleet.col.tier"))
    for heading_key in (
        "fleet.col.gpus",
        "fleet.col.weight_per_gpu",
        "fleet.col.headroom_per_gpu",
    ):
        fleet_table.add_column(t(heading_key), justify="right")
    for context_len in context_columns:
        fleet_table.add_column(
            t("fleet.col.concurrent_at_ctx", ctx=_fmt_ctx(context_len)),
            justify="right",
        )

    for option in fleet.options:
        spare_bytes = option.usable_bytes_per_gpu - option.weight_bytes_per_gpu
        tier_text = t(f"fleet.tier.{option.tier}")
        star = " ★" if option.tier == fleet.best_tier else ""
        concurrency = dict(option.max_concurrent_by_context)

        cells = [
            f"{tier_text}{star}",
            str(option.gpu_count),
            _fmt_bytes(option.weight_bytes_per_gpu),
            _fmt_bytes(spare_bytes) if spare_bytes > 0 else "—",
        ]
        for context_len in context_columns:
            count = concurrency.get(context_len, 0)
            cells.append(f"~{count}" if count > 0 else "✗")

        fleet_table.add_row(*cells, style=None if option.fits else "dim red")

    console.print(fleet_table)

    if get_locale() == "zh":
        constraint = fleet.constraint_note_zh
    else:
        constraint = fleet.constraint_note_en
    console.print(f"[dim]{t('fleet.constraint')} {constraint}[/dim]")
    console.print(f"[dim]★ {t('fleet.best_marker')}[/dim]")
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
def _select_concurrency_columns(f: FleetRecommendation) -> list[int]:
    """Choose the context lengths shown as concurrency columns in the fleet table.

    The candidates are every context length that appears in any option's
    ``max_concurrent_by_context`` pairs. Selection:

    * 131_072 (128K) is included whenever it is a candidate;
    * the largest candidate is added when it exceeds 128K;
    * if neither applies, a single column is used: 32_768 when available,
      otherwise the largest candidate.

    Returns an empty list when no option reports any context length.
    """
    candidates = {ctx for option in f.options for ctx, _ in option.max_concurrent_by_context}
    if not candidates:
        return []

    longest = max(candidates)
    chosen = [131_072] if 131_072 in candidates else []
    if longest > 131_072:
        # Strictly greater than 128K, so it can never duplicate the entry above.
        chosen.append(longest)
    if chosen:
        return chosen
    return [32_768 if 32_768 in candidates else longest]
|
| 377 |
+
|
| 378 |
+
|
| 379 |
+
def _fmt_ctx(ctx_tokens: int) -> str:
    """Format a context length in tokens as a short column label.

    * below 1024: the plain number ("512");
    * 1024 up to 999_999: whole binary K, floored ("128K" for 131_072);
    * 1_000_000 and above: "M". Exact multiples of 1_000_000 and exact
      multiples of 2**20 render as a whole number ("1M" for both 1_000_000
      and 1_048_576); anything else keeps one decimal of decimal millions
      ("1.5M").

    The 2**20 case exists because the K branch is binary: without it a
    power-of-two context such as 1_048_576 or 2_097_152 would be shown as
    "1.0M" / "2.1M" right next to a "128K" column.
    """
    if ctx_tokens >= 1_000_000:
        if ctx_tokens % 1_000_000 == 0:
            return f"{ctx_tokens // 1_000_000}M"
        binary_m, remainder = divmod(ctx_tokens, 1024 * 1024)
        if remainder == 0:
            return f"{binary_m}M"
        return f"{ctx_tokens / 1_000_000:.1f}M"
    if ctx_tokens >= 1024:
        return f"{ctx_tokens // 1024}K"
    return str(ctx_tokens)
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
def _render_performance(report: EvaluationReport, console: Console) -> None:
    """Print the performance section: assumptions, metrics table, optimizations.

    The section is skipped entirely unless the report carries prefill, decode
    and concurrency estimates plus the input-token count and the target
    tokens/sec. Output order: a blank line, a dim italic assumptions banner,
    the metrics table (with the MoE "active only" row only when that estimate
    exists), then a fixed list of optimization hints.
    """
    prefill = report.prefill
    decode = report.decode
    concurrency = report.concurrency
    prerequisites = (
        prefill,
        decode,
        concurrency,
        report.perf_input_tokens,
        report.perf_target_tokens_per_sec,
    )
    if any(item is None for item in prerequisites):
        return

    console.print()
    # Every number below depends on these knobs, so they are shown first.
    banner = t(
        "perf.assumptions_note",
        input_tokens=report.perf_input_tokens,
        output_tokens=report.perf_output_tokens,
        target_tps=report.perf_target_tokens_per_sec,
        prefill_util=prefill.utilization,
        decode_util=decode.bw_utilization,
        degradation=concurrency.degradation_factor,
    )
    console.print(f"[dim italic]{banner}[/dim italic]")

    perf_table = Table(
        title=t("section.performance"),
        title_justify="left",
        show_header=False,
        box=None,
        padding=(0, 2),
    )
    perf_table.add_column("field", style="dim")
    perf_table.add_column("value")
    perf_table.add_column("label")

    def add_metric(label_key: str, text: str, annotated: Any) -> None:
        # One table row: translated field name, formatted value, provenance tag.
        perf_table.add_row(t(label_key), text, format_tag(annotated))

    add_metric("perf.prefill_latency", f"{prefill.latency_ms.value:.1f} ms", prefill.latency_ms)
    add_metric(
        "perf.decode_throughput_per_gpu",
        f"{decode.per_gpu_tokens_per_sec.value:.1f} tok/s",
        decode.per_gpu_tokens_per_sec,
    )
    add_metric(
        "perf.decode_throughput_cluster",
        f"{decode.cluster_tokens_per_sec.value:.1f} tok/s",
        decode.cluster_tokens_per_sec,
    )
    moe_active = decode.moe_active_tokens_per_sec
    if moe_active is not None:
        add_metric(
            "perf.decode_moe_active_optimistic",
            f"{moe_active.value:.1f} tok/s",
            moe_active,
        )
    for label_key, bound in (
        ("perf.k_bound", concurrency.k_bound),
        ("perf.l_bound", concurrency.l_bound),
        ("perf.max_concurrent", concurrency.max_concurrent),
    ):
        add_metric(label_key, str(bound.value), bound)

    bottleneck_name = t(f"perf.bottleneck.{concurrency.bottleneck}")
    if get_locale() == "zh":
        why = concurrency.bottleneck_reason_zh
    else:
        why = concurrency.bottleneck_reason_en
    perf_table.add_row(t("perf.bottleneck"), f"{bottleneck_name} — {why}", Text(""))
    console.print(perf_table)

    # The hint list is static; it does not vary with the detected bottleneck.
    console.print(f"[bold]{t('perf.optimization.header')}:[/bold]")
    hint_keys = (
        "perf.opt.quantize_int4",
        "perf.opt.relax_sla",
        "perf.opt.kv_fp8",
        "perf.opt.moe_offload",
    )
    for hint_key in hint_keys:
        console.print(f"  • {t(hint_key)}")
|
| 485 |
+
|
| 486 |
+
|
| 487 |
+
def _render_command(report: EvaluationReport, console: Console) -> None:
    """Print the generated launch command in a green panel.

    The panel title names the tier the command was generated for: the fleet
    option whose tier equals ``fleet.best_tier``, falling back to the first
    option when none matches.

    Nothing is printed when there is no generated command, no fleet
    recommendation, or the recommendation has no options. The last case is
    guarded explicitly: the fallback ``options[0]`` is evaluated eagerly as
    the ``next()`` default, so an empty list would raise IndexError.
    """
    fleet = report.fleet
    if not report.generated_command or fleet is None or not fleet.options:
        return
    # Figure out which tier we emitted the command for.
    best_tier_opt = next(
        (o for o in fleet.options if o.tier == fleet.best_tier),
        fleet.options[0],
    )
    tier_label = t(f"fleet.tier.{best_tier_opt.tier}")
    header_note = t("command.tier_note", tier=tier_label, gpus=best_tier_opt.gpu_count)
    console.print()
    console.print(
        Panel(
            report.generated_command,
            title=f"{t('section.command')} — {header_note}",
            title_align="left",
            border_style="green",
        )
    )
|
| 506 |
+
|
| 507 |
+
|
| 508 |
+
def _render_label_legend(console: Console) -> None:
    """Print a one-line legend with every ``Label`` member in its own style.

    Labels missing from ``_LABEL_STYLES`` are rendered in "white".
    """
    line = Text()
    line.append(f"{t('section.labels')} ", style="dim")
    for member in Label:
        caption = t(f"label.{member.value}")
        member_style = _LABEL_STYLES.get(member, "white")
        line.append(f"[{caption}] ", style=member_style)
    console.print(line)
|
| 515 |
+
|
| 516 |
+
|
| 517 |
+
def _verified_tag() -> Text:
    """Return the bracketed, translated "verified" tag in the VERIFIED style."""
    caption = t("label.verified")
    return Text(f"[{caption}]", style=_LABEL_STYLES[Label.VERIFIED])
|
| 519 |
+
|
| 520 |
+
|
| 521 |
+
def render_llm_review(result: Any, console: Console | None = None) -> None:
    """Render the --llm-review block. Accepts an LLMReviewResult.

    Failure is non-fatal: when ``result.ok`` is false, the error and a setup
    hint are printed and the function returns. On success the output is a
    disclaimer naming the model and base URL, the styled LLM-opinion tag, and
    then the review text.

    Args:
        result: Object exposing ``ok``, ``error``, ``model``, ``base_url`` and
            ``content``.
        console: Console to print to; a new ``Console()`` is created when
            omitted.
    """
    console = console or Console()
    console.print()
    console.print(Panel.fit(t("section.llm_review"), border_style="magenta"))

    if not result.ok:
        msg = t("llm_review.unavailable", error=result.error or "unknown")
        console.print(f"[yellow]{msg}[/yellow]")
        console.print(f"[dim]{t('llm_review.setup_hint')}[/dim]")
        return

    # Disclaimer first — make it visually distinctive so users don't confuse
    # LLM opinion with the tool's own output.
    disclaimer = t("llm_review.disclaimer", model=result.model, base_url=result.base_url)
    console.print(f"[bold yellow]{disclaimer}[/bold yellow]")
    console.print()

    # The tag is built as a Text object, like the other label tags in this
    # module. Interpolating "[<label>]" into a markup string would make Rich
    # parse the bracketed label as a style tag and drop it from the output.
    tag_display = t(f"label.{Label.LLM_OPINION.value}")
    console.print(Text(f"[{tag_display}]", style=_LABEL_STYLES[Label.LLM_OPINION]))

    # Print the review verbatim. Markup parsing is disabled because the text
    # is untrusted model output: bracketed spans would be swallowed as tags
    # and an unmatched closing tag such as "[/x]" raises MarkupError.
    console.print(result.content or "", markup=False)
|
| 548 |
+
|
| 549 |
+
|
| 550 |
+
def render_explain(entries: list[Any], console: Console | None = None) -> None:
    """Render the `--explain` block: a derivation trace for each entry.

    `entries` is a list of `core.explain.ExplainEntry`. After a header panel
    and intro line, each entry prints its heading, formula lines, inputs,
    steps, result, source and methodology anchor (the optional parts only when
    non-empty), followed by a blank separator line.
    """
    if not console:
        console = Console()

    console.print()
    console.print(Panel.fit(t("section.explain"), border_style="magenta"))
    console.print(f"[dim italic]{t('explain.intro')}[/dim italic]")
    console.print()

    for item in entries:
        # Heading panel for this entry.
        console.print(Panel.fit(f"[bold]{item.heading}[/bold]", border_style="cyan"))

        # Formula, one indented line per source line.
        console.print(f"[bold]{t('explain.formula')}:[/bold]")
        for formula_line in item.formula.splitlines():
            console.print(f"  [magenta]{formula_line}[/magenta]")

        # Inputs with their label and optional note.
        if item.inputs:
            console.print(f"[bold]{t('explain.inputs')}:[/bold]")
            for source_input in item.inputs:
                suffix = f" [dim]({source_input.note})[/dim]" if source_input.note else ""
                console.print(
                    f"  [cyan]{source_input.name}[/cyan] = {source_input.value} "
                    f"[dim]{source_input.label}[/dim]{suffix}"
                )

        # Steps; multi-line steps are printed line by line.
        if item.steps:
            console.print(f"[bold]{t('explain.steps')}:[/bold]")
            step_lines = (text for step in item.steps for text in step.splitlines())
            for text in step_lines:
                console.print(f"  {text}")

        console.print(f"[bold]{t('explain.result')}:[/bold] {item.result}")

        # Source and pointer into the methodology document.
        if item.source:
            console.print(f"[bold]{t('explain.source')}:[/bold] {item.source}")
        if item.methodology_anchor:
            console.print(
                f"[dim]{t('explain.see_also')}: docs/methodology.md{item.methodology_anchor}[/dim]"
            )
        console.print()
|
| 598 |
+
|
| 599 |
+
|
| 600 |
+
def render_gpu_list(db: GPUDatabase, console: Console | None = None) -> None:
    """Print the supported-GPU table. Invoked by `llm-cal --list-gpus`.

    One row per entry of ``db.gpus`` in stored order, followed by a dim total
    line. Missing NVLink bandwidth and empty alias lists are shown as "—".
    """
    console = console or Console()
    locale = get_locale()

    gpu_table = Table(
        title=t("gpus.list.title"),
        title_justify="left",
        show_header=True,
        header_style="dim",
        box=None,
        padding=(0, 2),
    )
    column_specs = (
        ("gpus.col.id", None),
        ("gpus.col.memory", "right"),
        ("gpus.col.nvlink", "right"),
        ("gpus.col.fp16", "right"),
        ("gpus.col.fp8", "center"),
        ("gpus.col.fp4", "center"),
        ("gpus.col.aliases", None),
    )
    for heading_key, alignment in column_specs:
        if alignment is None:
            gpu_table.add_column(t(heading_key))
        else:
            gpu_table.add_column(t(heading_key), justify=alignment)

    supported = t("hw.bool_yes")
    unsupported = t("hw.bool_no")

    # Preserve YAML insertion order (vendors are grouped there).
    for gpu in db.gpus:
        if gpu.nvlink_bandwidth_gbps:
            nvlink_text = f"{gpu.nvlink_bandwidth_gbps} GB/s"
        else:
            nvlink_text = "—"
        alias_text = ", ".join(gpu.aliases) if gpu.aliases else "—"
        gpu_table.add_row(
            gpu.id,
            f"{gpu.memory_gb} GB",
            nvlink_text,
            f"{gpu.fp16_tflops:.0f}",
            supported if gpu.fp8_support else unsupported,
            supported if gpu.fp4_support else unsupported,
            alias_text,
        )
    console.print(gpu_table)
    console.print(f"[dim]{t('gpus.total', count=len(db.gpus))}[/dim]")
    _ = locale  # suppress unused var warn until we add locale-dependent notes column
|
| 640 |
+
|
| 641 |
+
|
| 642 |
+
def _verif_label(entry: EngineCompatEntry) -> Text:
    """Build the styled tag for an engine-compat entry's verification level.

    "verified", "cited" and "unverified" map to the matching ``Label``; any
    other level maps to ``Label.UNKNOWN``. The style falls back to "white"
    for labels missing from ``_LABEL_STYLES``.
    """
    by_level = {
        "verified": Label.VERIFIED,
        "cited": Label.CITED,
        "unverified": Label.UNVERIFIED,
    }
    try:
        resolved = by_level[entry.verification_level]
    except KeyError:
        resolved = Label.UNKNOWN
    caption = t(f"label.{resolved.value}")
    return Text(f"[{caption}]", style=_LABEL_STYLES.get(resolved, "white"))
|
| 650 |
+
|
| 651 |
+
|
| 652 |
+
def _fmt_flag(f: EngineFlag) -> str:
    """Format an engine flag as "<flag>" or "<flag> <value>".

    Only a value of ``None`` suppresses the value part; falsy values such as
    ``0`` or ``""`` are still appended.
    """
    return f.flag if f.value is None else f"{f.flag} {f.value}"
|
| 656 |
+
|
| 657 |
+
|
| 658 |
+
def _fmt_source(s: EngineSource) -> str:
    """Format one engine-compat source as a single line.

    * type "tested": "[label] tester @ hardware (date)";
    * any other type with a URL: "[label] url", plus " (<captured-on> date)"
      when a capture date is set;
    * otherwise just "[label]".
    """
    prefix = f"[{t(f'source.{s.type}')}]"
    if s.type == "tested":
        return f"{prefix} {s.tester} @ {s.hardware} ({s.date})"
    if not s.url:
        return prefix
    suffix = ""
    if s.captured_date:
        suffix = f" ({t('source.captured_on')} {s.captured_date})"
    return f"{prefix} {s.url}{suffix}"
|
src/llm_cal/output/labels.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""6-level label discipline — the soul of the tool.
|
| 2 |
+
|
| 3 |
+
Every number in the output must be wrapped in `AnnotatedValue` so users always know
|
| 4 |
+
where a value came from. Using `StrEnum` (not bare strings) means typos are caught by
|
| 5 |
+
mypy/ruff, not by users.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from dataclasses import dataclass
|
| 11 |
+
from enum import StrEnum
|
| 12 |
+
from typing import Generic, TypeVar
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class Label(StrEnum):
|
| 16 |
+
VERIFIED = "verified"
|
| 17 |
+
INFERRED = "inferred"
|
| 18 |
+
ESTIMATED = "estimated"
|
| 19 |
+
CITED = "cited"
|
| 20 |
+
UNVERIFIED = "unverified"
|
| 21 |
+
UNKNOWN = "unknown"
|
| 22 |
+
# Experimental opt-in 7th level. Populated only when --llm-review is used.
|
| 23 |
+
# Never overrides the first 6 — it's an external second opinion, not truth.
|
| 24 |
+
LLM_OPINION = "llm-opinion"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
T = TypeVar("T")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@dataclass(frozen=True)
class AnnotatedValue(Generic[T]):
    """Immutable pairing of a value with its provenance.

    Examples:
        AnnotatedValue(160_300_000_000, Label.VERIFIED, source="HF model_info.siblings")
        AnnotatedValue(4.52, Label.INFERRED, source="160.3 GB / 284B params")
        AnnotatedValue(2_600_000_000, Label.ESTIMATED,
                       source="compress_ratios=[0,0,4,128,...] at 128K ctx")
    """

    # The payload itself.
    value: T
    # How much the value can be trusted.
    label: Label
    # Optional free-text note on where the value came from.
    source: str | None = None

    def render_tag(self) -> str:
        """Return the label's raw value wrapped in square brackets."""
        return "[{}]".format(self.label.value)
|
src/llm_cal/performance/__init__.py
ADDED
|
File without changes
|
src/llm_cal/performance/compute.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Performance modeling for prefill latency and decode throughput.
|
| 2 |
+
|
| 3 |
+
FORMULAS — with sources. See docs/methodology.md for the full audit.
|
| 4 |
+
|
| 5 |
+
Prefill (compute-bound):
|
| 6 |
+
FLOPs = 2 × params × input_tokens
|
| 7 |
+
latency = FLOPs / (peak_TFLOPS × num_gpus × utilization × 1e12)
|
| 8 |
+
|
| 9 |
+
Source: Kaplan et al. 2020, "Scaling Laws for Neural Language Models".
|
| 10 |
+
The "2" factor is the forward-pass cost per param per token, a standard
|
| 11 |
+
approximation in transformer inference literature.
|
| 12 |
+
|
| 13 |
+
Decode (memory-bandwidth-bound):
|
| 14 |
+
per_token_time = weight_bytes_per_gpu / (memory_bandwidth × utilization)
|
| 15 |
+
tokens_per_second = memory_bandwidth × utilization / weight_bytes_per_gpu
|
| 16 |
+
|
| 17 |
+
Source: Kwon et al. SOSP 2023 "Efficient Memory Management for Large
|
| 18 |
+
Language Model Serving with PagedAttention"; NVIDIA "Mastering LLM
|
| 19 |
+
Techniques: Inference Optimization" (2023 technical blog).
|
| 20 |
+
|
| 21 |
+
UTILIZATION FACTORS (all empirical, ALL user-overridable):
|
| 22 |
+
- Prefill 40% — midpoint of vLLM-reported 30-50% MFU on H100
|
| 23 |
+
- Decode BW 50% — midpoint of NVIDIA/vLLM-reported 40-65% achieved bandwidth
|
| 24 |
+
- Cluster comm 90% — typical NCCL AllReduce efficiency at TP=8 on NVLink
|
| 25 |
+
- Concurrency degradation 1.0 (no degradation by default)
|
| 26 |
+
This is the most uncertain factor. Prior versions defaulted to 1.5
|
| 27 |
+
(borrowed from an LLM-generated report), which was NOT from a primary
|
| 28 |
+
source. v0.1 defaults to 1.0 (honest baseline) and exposes the knob
|
| 29 |
+
so users can dial in whatever their engine actually achieves.
|
| 30 |
+
|
| 31 |
+
MoE "active" vs "total":
|
| 32 |
+
Strictly, MoE decode only reads the active experts per token. The
|
| 33 |
+
ratio used here is a rough approximation:
|
| 34 |
+
active_ratio ≈ (experts_per_tok + shared_experts) / (routed + shared)
|
| 35 |
+
This UNDERESTIMATES active weight because attention + embeddings are
|
| 36 |
+
always active (not just experts). For a more accurate number, use the
|
| 37 |
+
model card's stated "total / active" figure if available. The
|
| 38 |
+
"active-only" throughput is labeled "optimistic" for this reason.
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
from __future__ import annotations
|
| 42 |
+
|
| 43 |
+
from dataclasses import dataclass
|
| 44 |
+
|
| 45 |
+
from llm_cal.architecture.profile import ArchitectureProfile
|
| 46 |
+
from llm_cal.hardware.loader import GPUSpec
|
| 47 |
+
from llm_cal.output.labels import AnnotatedValue, Label
|
| 48 |
+
|
| 49 |
+
# Empirical default factors; each one can be overridden from the CLI.
DEFAULT_PREFILL_UTILIZATION = 0.40
DEFAULT_DECODE_BW_UTILIZATION = 0.50
DEFAULT_CLUSTER_COMM_EFFICIENCY = 0.90
# 1.0 means "no degradation" and is the honest baseline. The earlier default
# of 1.5 was taken from an LLM-generated report with no primary source, so it
# was reset. Users who measure real degradation on their engine should raise
# this via the CLI.
DEFAULT_CONCURRENCY_DEGRADATION = 1.0
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
@dataclass(frozen=True)
class PrefillEstimate:
    """Immutable result of a prefill latency estimate."""

    # [estimated] 2 * params * input_tokens
    total_flops: AnnotatedValue[int]
    # Peak TFLOPS after applying the utilization factor.
    peak_effective_tflops: AnnotatedValue[float]
    # Estimated prefill latency in milliseconds.
    latency_ms: AnnotatedValue[float]
    # The utilization factor that was used, kept for provenance.
    utilization: float
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@dataclass(frozen=True)
class DecodeEstimate:
    """Immutable result of a decode throughput estimate.

    The two ``moe_*`` fields default to None and carry the optional
    "active experts only" figures.
    """

    # Weight bytes each GPU holds.
    active_weight_bytes_per_gpu: AnnotatedValue[int]
    # Decode tokens/second of a single GPU.
    per_gpu_tokens_per_sec: AnnotatedValue[float]
    # Cluster-level tokens/second, after comm efficiency.
    cluster_tokens_per_sec: AnnotatedValue[float]
    # Factors that were used, kept for provenance.
    bw_utilization: float
    cluster_comm_efficiency: float
    # Optional MoE "active only" variants of the figures above.
    moe_active_weight_bytes_per_gpu: AnnotatedValue[int] | None = None
    moe_active_tokens_per_sec: AnnotatedValue[float] | None = None
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def estimate_prefill(
    profile: ArchitectureProfile,
    total_params: int,
    gpu: GPUSpec,
    num_gpus: int,
    input_tokens: int,
    utilization: float = DEFAULT_PREFILL_UTILIZATION,
) -> PrefillEstimate:
    """Estimate single-request prefill latency from compute alone.

    FLOPs = 2 × params × tokens; latency = FLOPs / effective FLOPS, where the
    effective rate is the GPU's FP16 TFLOPS × GPU count × utilization.

    Args:
        profile: Architecture profile; not read by this function.
        total_params: Total parameter count.
        gpu: GPU spec supplying ``fp16_tflops``.
        num_gpus: Number of GPUs sharing the compute.
        input_tokens: Prompt length in tokens.
        utilization: Fraction of peak compute assumed achievable.

    Returns:
        A ``PrefillEstimate`` labelled ESTIMATED, or one holding zeros labelled
        UNKNOWN when the effective TFLOPS, the parameter count or the token
        count is not positive.
    """
    total_flops = 2 * total_params * input_tokens
    # Tensor parallelism spreads the compute across cards.
    effective_tflops = gpu.fp16_tflops * num_gpus * utilization

    # Non-positive inputs would divide by zero or give a meaningless latency.
    if effective_tflops <= 0 or total_params <= 0 or input_tokens <= 0:
        return PrefillEstimate(
            total_flops=AnnotatedValue(0, Label.UNKNOWN, source="insufficient inputs"),
            peak_effective_tflops=AnnotatedValue(0.0, Label.UNKNOWN),
            latency_ms=AnnotatedValue(0.0, Label.UNKNOWN),
            utilization=utilization,
        )

    seconds = total_flops / (effective_tflops * 1e12)
    milliseconds = seconds * 1000.0

    flops_source = f"2 × {total_params:,} params × {input_tokens:,} tokens"
    tflops_source = f"{gpu.fp16_tflops} × {num_gpus} GPUs × {utilization:.0%} util"
    latency_source = (
        f"{total_flops:.2e} FLOPs / ({effective_tflops:.1f} effective TFLOPS × 1e12)"
    )
    return PrefillEstimate(
        total_flops=AnnotatedValue(total_flops, Label.ESTIMATED, source=flops_source),
        peak_effective_tflops=AnnotatedValue(
            effective_tflops, Label.ESTIMATED, source=tflops_source
        ),
        latency_ms=AnnotatedValue(milliseconds, Label.ESTIMATED, source=latency_source),
        utilization=utilization,
    )
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def _nvlink_efficiency(gpu: GPUSpec, num_gpus: int) -> float:
    """Return the NVLink-dependent multiplier on cluster comm efficiency.

    * one GPU or fewer: 1.0 (no penalty is applied);
    * NVLink bandwidth of 900 GB/s or more: 1.0;
    * missing, zero or negative NVLink bandwidth: 0.80;
    * anything in between: 0.85 + 0.15 × (bandwidth / 900), e.g. about 0.92
      at 400 GB/s and 0.95 at 600 GB/s.
    """
    if num_gpus <= 1:
        return 1.0

    # A missing bandwidth (None) is treated like 0.
    link_gbps = gpu.nvlink_bandwidth_gbps if gpu.nvlink_bandwidth_gbps else 0
    if link_gbps <= 0:
        return 0.80
    if link_gbps >= 900:
        return 1.0
    return 0.85 + 0.15 * (link_gbps / 900.0)
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def _unknown_decode_estimate(
    reason: str,
    bw_utilization: float,
    cluster_comm_efficiency: float,
) -> DecodeEstimate:
    """Build the zeroed, ``Label.UNKNOWN`` estimate used when inputs are unusable."""
    unknown_bytes = AnnotatedValue(0, Label.UNKNOWN, source=reason)
    unknown_rate = AnnotatedValue(0.0, Label.UNKNOWN, source=reason)
    return DecodeEstimate(
        active_weight_bytes_per_gpu=unknown_bytes,
        per_gpu_tokens_per_sec=unknown_rate,
        cluster_tokens_per_sec=unknown_rate,
        bw_utilization=bw_utilization,
        cluster_comm_efficiency=cluster_comm_efficiency,
    )


def estimate_decode(
    profile: ArchitectureProfile,
    total_weight_bytes: int,
    gpu: GPUSpec,
    num_gpus: int,
    bw_utilization: float = DEFAULT_DECODE_BW_UTILIZATION,
    cluster_comm_efficiency: float = DEFAULT_CLUSTER_COMM_EFFICIENCY,
    moe_active_params_ratio: float | None = None,
) -> DecodeEstimate:
    """Estimate decode tokens/second.

    Decode is modelled as memory-bandwidth-bound: per-token time is
    ``weight_bytes / bandwidth``. Under TP the weights are split across
    ranks, so each GPU reads ``total_weight_bytes // num_gpus`` bytes.

    Args:
        profile: Architecture profile; only ``is_moe`` is read here.
        total_weight_bytes: Size of all model weights in bytes.
        gpu: GPU spec; ``memory_bandwidth_gbps`` and ``nvlink_bandwidth_gbps``
            are read.
        num_gpus: Number of TP ranks.
        bw_utilization: Fraction of peak memory bandwidth assumed achievable.
        cluster_comm_efficiency: Base multiplier for multi-GPU scaling; it is
            further multiplied by the NVLink-aware factor.
        moe_active_params_ratio: Active/total parameter ratio (e.g. 0.3). When
            given for a MoE profile, an optimistic "active only" throughput is
            reported as well.

    Returns:
        A ``DecodeEstimate``. If the GPU has no usable memory bandwidth figure,
        or ``num_gpus`` is not positive, every value is 0 and labelled
        ``Label.UNKNOWN`` instead of raising.
    """
    if gpu.memory_bandwidth_gbps is None or gpu.memory_bandwidth_gbps <= 0:
        return _unknown_decode_estimate(
            "GPU memory_bandwidth_gbps not in database",
            bw_utilization,
            cluster_comm_efficiency,
        )
    if num_gpus <= 0:
        # Without this guard the per-GPU split below divides by zero (or
        # yields negative throughput for negative counts).
        return _unknown_decode_estimate(
            "num_gpus must be at least 1",
            bw_utilization,
            cluster_comm_efficiency,
        )

    bw_bytes_per_s = gpu.memory_bandwidth_gbps * 1e9  # GB/s → bytes/s
    effective_bw = bw_bytes_per_s * bw_utilization
    # Clamp to 1 byte so a zero/tiny weight size never divides by zero.
    weight_per_gpu = max(1, total_weight_bytes // num_gpus)
    per_gpu_tps = effective_bw / weight_per_gpu
    # Cluster-level: per-GPU × N × comm_efficiency × NVLink-aware penalty.
    # NVLink penalty captures TP all-reduce overhead on cards with restricted
    # interconnect (H800, PCIe-only). Single-GPU is unaffected.
    # NOTE(review): per_gpu_tps already reflects the 1/N weight shard, so
    # multiplying by num_gpus again makes cluster throughput grow with N² —
    # confirm this is the intended model.
    nvlink_eff = _nvlink_efficiency(gpu, num_gpus)
    effective_comm_eff = cluster_comm_efficiency * nvlink_eff
    cluster_tps = per_gpu_tps * num_gpus * effective_comm_eff

    # MoE active-only optimistic view
    moe_active_weight: AnnotatedValue[int] | None = None
    moe_active_tps: AnnotatedValue[float] | None = None
    if profile.is_moe and moe_active_params_ratio is not None and moe_active_params_ratio > 0:
        active_bytes = int(weight_per_gpu * moe_active_params_ratio)
        moe_active_weight = AnnotatedValue(
            active_bytes,
            Label.ESTIMATED,
            source=f"{weight_per_gpu:,} × {moe_active_params_ratio:.3f} (active/total ratio)",
        )
        # int() truncation can produce 0 for tiny ratios; skip the rate then.
        if active_bytes > 0:
            active_per_gpu_tps = effective_bw / active_bytes
            active_cluster_tps = active_per_gpu_tps * num_gpus * effective_comm_eff
            moe_active_tps = AnnotatedValue(
                active_cluster_tps,
                Label.ESTIMATED,
                source=(
                    f"optimistic MoE active-only: effective_bw / {active_bytes:,} × "
                    f"{num_gpus} × {effective_comm_eff:.3f}"
                ),
            )

    return DecodeEstimate(
        active_weight_bytes_per_gpu=AnnotatedValue(
            weight_per_gpu,
            Label.ESTIMATED,
            source=f"{total_weight_bytes:,} bytes / {num_gpus} TP ranks",
        ),
        per_gpu_tokens_per_sec=AnnotatedValue(
            per_gpu_tps,
            Label.ESTIMATED,
            source=(
                f"{gpu.memory_bandwidth_gbps} GB/s × {bw_utilization:.0%} util / "
                f"{weight_per_gpu:,} weight bytes"
            ),
        ),
        cluster_tokens_per_sec=AnnotatedValue(
            cluster_tps,
            Label.ESTIMATED,
            source=(
                f"per-GPU × {num_gpus} GPUs × {cluster_comm_efficiency:.0%} comm × "
                f"{nvlink_eff:.3f} NVLink penalty (NVLink={gpu.nvlink_bandwidth_gbps or 0} GB/s)"
            ),
        ),
        bw_utilization=bw_utilization,
        cluster_comm_efficiency=cluster_comm_efficiency,
        moe_active_weight_bytes_per_gpu=moe_active_weight,
        moe_active_tokens_per_sec=moe_active_tps,
    )
|
src/llm_cal/performance/concurrency.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Dual-bound concurrency analysis + bottleneck classification.
|
| 2 |
+
|
| 3 |
+
Models two concurrency ceilings:
|
| 4 |
+
K = memory-capacity bound
|
| 5 |
+
(usable GPU memory ÷ per-request KV cache)
|
| 6 |
+
L = compute/bandwidth bound at a given SLA
|
| 7 |
+
(cluster decode throughput ÷ target per-user tokens/sec ÷ degradation)
|
| 8 |
+
|
| 9 |
+
Max concurrent = min(K, L). Whichever is smaller names the bottleneck.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import math
|
| 15 |
+
from dataclasses import dataclass
|
| 16 |
+
from typing import Literal
|
| 17 |
+
|
| 18 |
+
from llm_cal.output.labels import AnnotatedValue, Label
|
| 19 |
+
from llm_cal.performance.compute import (
|
| 20 |
+
DEFAULT_CONCURRENCY_DEGRADATION,
|
| 21 |
+
DecodeEstimate,
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
# Closed set of tags that can be stored in `ConcurrencyAnalysis.bottleneck`.
Bottleneck = Literal["memory_capacity", "memory_bandwidth", "compute", "insufficient_data"]
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
@dataclass(frozen=True)
class ConcurrencyAnalysis:
    """Immutable result of comparing the K (memory) and L (throughput) bounds."""

    # --- K: memory-capacity bound and the two inputs it was derived from ---
    k_bound: AnnotatedValue[int]
    k_source_headroom_bytes: int
    k_source_kv_per_req_bytes: int

    # --- L: throughput bound and the SLA parameters it was derived from ---
    l_bound: AnnotatedValue[int]
    target_tokens_per_sec: float
    degradation_factor: float

    # --- Verdict: the tighter bound, which side it came from, and why ---
    max_concurrent: AnnotatedValue[int]
    bottleneck: Bottleneck
    bottleneck_reason_en: str  # English explanation
    bottleneck_reason_zh: str  # Chinese explanation
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def analyze(
    *,
    cluster_headroom_bytes: int,  # total KV headroom across all GPUs at ref context
    kv_bytes_per_request: int,  # single-request KV cache at ref context
    decode: DecodeEstimate,
    target_tokens_per_sec: float,
    degradation: float = DEFAULT_CONCURRENCY_DEGRADATION,
) -> ConcurrencyAnalysis:
    """Compute K and L bounds and pick the tighter one.

    `cluster_headroom_bytes` and `kv_bytes_per_request` should be pre-adjusted
    for TP sharding (see fleet planner for the same rule).

    Args:
        cluster_headroom_bytes: Memory available for KV cache, in bytes.
        kv_bytes_per_request: KV cache size of one request, in bytes. A value
            ``<= 0`` marks K as unknown.
        decode: Decode estimate; only ``cluster_tokens_per_sec.value`` is read.
        target_tokens_per_sec: Per-user throughput target.
        degradation: Divisor applied to the throughput-derived bound.

    Returns:
        A ``ConcurrencyAnalysis``. ``max_concurrent`` is ``min(K, L)`` when both
        bounds are known. If either bound is unknown, or both are zero, the
        result is 0 with bottleneck ``"insufficient_data"`` — an unknown bound
        is stored as 0 but is never reported as the limiting resource.
    """
    # K: how many requests fit in KV memory
    k_known = kv_bytes_per_request > 0
    if k_known:
        k = max(0, math.floor(cluster_headroom_bytes / kv_bytes_per_request))
        k_label = Label.ESTIMATED
        k_source = (
            f"{cluster_headroom_bytes:,} bytes headroom / "
            f"{kv_bytes_per_request:,} bytes per request"
        )
    else:
        k = 0
        k_label = Label.UNKNOWN
        k_source = "KV cache per request is zero or unknown"

    # L: how many concurrent users can maintain target tokens/sec
    cluster_tps = decode.cluster_tokens_per_sec.value
    l_known = cluster_tps > 0 and target_tokens_per_sec > 0 and degradation > 0
    if l_known:
        l_bound = max(0, math.floor(cluster_tps / target_tokens_per_sec / degradation))
        l_label = Label.ESTIMATED
        l_source = (
            f"{cluster_tps:.1f} tok/s cluster / "
            f"{target_tokens_per_sec:.1f} target / {degradation:.2f} degradation"
        )
    else:
        l_bound = 0
        l_label = Label.UNKNOWN
        l_source = "cluster throughput or target is zero / unknown"

    # Pick the tighter bound
    bottleneck: Bottleneck
    if k == 0 and l_bound == 0:
        max_n = 0
        bottleneck = "insufficient_data"
        reason_en = "Both K and L unknown — cannot conclude."
        reason_zh = "K 和 L 均未知,无法得出结论。"
    elif not k_known:
        # K's 0 is a placeholder, not a measurement: do not blame memory.
        max_n = 0
        bottleneck = "insufficient_data"
        reason_en = (
            "K unknown (KV cache per request is zero or unknown) — "
            f"cannot conclude from L ({l_bound}) alone."
        )
        reason_zh = f"K 未知(单请求 KV cache 为零或未知),仅凭 L ({l_bound}) 无法得出结论。"
    elif not l_known:
        # L's 0 is a placeholder, not a measurement: do not blame bandwidth.
        max_n = 0
        bottleneck = "insufficient_data"
        reason_en = (
            "L unknown (cluster throughput or target is zero / unknown) — "
            f"cannot conclude from K ({k}) alone."
        )
        reason_zh = f"L 未知(集群吞吐或目标值为零/未知),仅凭 K ({k}) 无法得出结论。"
    elif k <= l_bound:
        max_n = k
        bottleneck = "memory_capacity"
        reason_en = (
            f"K ({k}) ≤ L ({l_bound}) → memory-capacity bound. "
            "KV cache exhausts GPU headroom before throughput SLA does."
        )
        reason_zh = (
            f"K ({k}) ≤ L ({l_bound}) → 显存容量瓶颈。先达到 KV cache 容量上限,才到吞吐目标。"
        )
    else:
        max_n = l_bound
        # Whether it's "compute" or "bandwidth" depends on where decode is bound.
        # For v0.1 we just say "memory bandwidth / compute" since decode is
        # bw-bound by default and the two share the same formula output.
        bottleneck = "memory_bandwidth"
        reason_en = (
            f"L ({l_bound}) < K ({k}) → memory-bandwidth / compute bound. "
            "Cluster can't sustain target tok/s per user at this concurrency."
        )
        reason_zh = f"L ({l_bound}) < K ({k}) → 带宽/算力瓶颈。集群在此并发下无法维持目标 tok/s。"

    return ConcurrencyAnalysis(
        k_bound=AnnotatedValue(k, k_label, source=k_source),
        k_source_headroom_bytes=cluster_headroom_bytes,
        k_source_kv_per_req_bytes=kv_bytes_per_request,
        l_bound=AnnotatedValue(l_bound, l_label, source=l_source),
        target_tokens_per_sec=target_tokens_per_sec,
        degradation_factor=degradation,
        max_concurrent=AnnotatedValue(
            max_n,
            Label.ESTIMATED if max_n > 0 else Label.UNKNOWN,
            source=f"min(K={k}, L={l_bound})",
        ),
        bottleneck=bottleneck,
        bottleneck_reason_en=reason_en,
        bottleneck_reason_zh=reason_zh,
    )
|
src/llm_cal/weight_analyzer/__init__.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Weight analyzer — observed bytes + inferred quantization scheme.
|
| 2 |
+
|
| 3 |
+
Rules:
|
| 4 |
+
- `[verified]` — directly from HF/ModelScope API (sum of siblings[].size). Nothing else.
|
| 5 |
+
- `[inferred]` — any derivation, including bits/param and quantization guess.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from dataclasses import dataclass
|
| 11 |
+
from typing import TYPE_CHECKING, Literal
|
| 12 |
+
|
| 13 |
+
from llm_cal.model_source.base import SiblingFile
|
| 14 |
+
from llm_cal.output.labels import AnnotatedValue, Label
|
| 15 |
+
|
| 16 |
+
if TYPE_CHECKING:
|
| 17 |
+
from llm_cal.weight_analyzer.fingerprint import QuantFingerprint
|
| 18 |
+
|
| 19 |
+
# Names of the quantization schemes the analyzer can report. Their
# bytes-per-param anchors are listed in `_QUANT_BPP`; bits/param = bpp * 8.
QuantizationScheme = Literal[
    # 16-bit and 8-bit formats
    "FP16", "BF16", "FP8", "INT8",
    # mixed FP4 + FP8 pack (DeepSeek-V4-Flash style)
    "FP4_FP8_MIXED",
    # 4-bit integer formats
    "INT4", "GPTQ_INT4", "AWQ_INT4",
    # nothing matched
    "UNKNOWN",
]
|
| 31 |
+
|
| 32 |
+
# Rough bytes-per-param anchor points. Used by reconciler.
|
| 33 |
+
_QUANT_BPP: dict[QuantizationScheme, float] = {
|
| 34 |
+
"FP16": 2.00,
|
| 35 |
+
"BF16": 2.00,
|
| 36 |
+
"FP8": 1.00,
|
| 37 |
+
"INT8": 1.00,
|
| 38 |
+
"FP4_FP8_MIXED": 0.55, # DeepSeek V4 empirical (~4.5 bits/param)
|
| 39 |
+
"INT4": 0.50,
|
| 40 |
+
"GPTQ_INT4": 0.55, # +scale tensors overhead
|
| 41 |
+
"AWQ_INT4": 0.55,
|
| 42 |
+
"UNKNOWN": 0.0,
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
@dataclass(frozen=True)
class WeightReport:
    """Immutable bundle of what the weight analyzer derived from files + params."""

    # Observed total size in bytes — the [verified] value.
    total_bytes: AnnotatedValue[int]
    # Derived bits per parameter ([inferred]); None when it could not be computed.
    bits_per_param: AnnotatedValue[float] | None
    # Best guess at the quantization scheme.
    quantization_guess: AnnotatedValue[QuantizationScheme]
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _safetensors_total_bytes(siblings: tuple[SiblingFile, ...]) -> int:
|
| 56 |
+
"""Sum all *.safetensors file sizes. Ignores config, tokenizer, etc."""
|
| 57 |
+
return sum((s.size or 0) for s in siblings if s.filename.endswith(".safetensors"))
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def analyze(
    siblings: tuple[SiblingFile, ...],
    total_params: int | None,
    fingerprint: QuantFingerprint | None = None,
) -> WeightReport:
    """Build a ``WeightReport`` from the sibling file list and a parameter count.

    The total safetensors size is always reported and labelled VERIFIED.
    When ``total_params`` is falsy or no safetensors bytes were found, nothing
    is inferred: ``bits_per_param`` is None and the scheme is UNKNOWN.

    Otherwise bits/param is derived from ``observed_bytes / total_params``
    (INFERRED). The scheme comes from ``fingerprint`` when one is supplied
    (labelled VERIFIED, with the fingerprint's evidence as source) and from
    the nearest-match heuristic on bytes/param when it is not.
    """
    observed_bytes = _safetensors_total_bytes(siblings)
    size_value = AnnotatedValue(
        observed_bytes,
        Label.VERIFIED,
        source="sum of safetensors siblings from model_info API",
    )

    # Without both a parameter count and some bytes there is nothing to divide.
    if observed_bytes == 0 or not total_params:
        no_scheme = AnnotatedValue(
            "UNKNOWN",
            Label.UNKNOWN,
            source="total_params unknown or no safetensors files",
        )
        return WeightReport(
            total_bytes=size_value,
            bits_per_param=None,
            quantization_guess=no_scheme,
        )

    bytes_per_param = observed_bytes / total_params

    scheme_value: AnnotatedValue[QuantizationScheme]
    if fingerprint is None:
        scheme_value = _guess_quantization(bytes_per_param)
    else:
        # Explicit evidence takes precedence over the size-based guess.
        scheme_value = AnnotatedValue(
            fingerprint.scheme,
            Label.VERIFIED,
            source=fingerprint.evidence,
        )

    return WeightReport(
        total_bytes=size_value,
        bits_per_param=AnnotatedValue(
            bytes_per_param * 8,
            Label.INFERRED,
            source=f"{observed_bytes} bytes / {total_params} params",
        ),
        quantization_guess=scheme_value,
    )
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def _guess_quantization(bpp: float) -> AnnotatedValue[QuantizationScheme]:
    """Pick the scheme whose bytes-per-param anchor is nearest to ``bpp``.

    Each candidate has its own tolerance, compared against the absolute
    difference in bytes/param: 0.10 for the mixed / overhead-carrying schemes
    and 0.05 for the pure ones. A candidate only competes when ``bpp`` is
    inside its tolerance; among those, the smallest difference wins and an
    exact tie keeps the candidate listed first. If no candidate qualifies the
    result is UNKNOWN.
    """
    tolerances: tuple[tuple[QuantizationScheme, float], ...] = (
        ("FP16", 0.05),
        ("FP8", 0.05),
        ("FP4_FP8_MIXED", 0.10),
        ("INT4", 0.05),
        ("GPTQ_INT4", 0.10),
    )

    winner: QuantizationScheme | None = None
    winner_gap = 0.0
    for scheme, tolerance in tolerances:
        gap = abs(bpp - _QUANT_BPP[scheme])
        in_range = gap <= tolerance
        # Strict "<" means an earlier candidate is never displaced by a tie.
        if in_range and (winner is None or gap < winner_gap):
            winner, winner_gap = scheme, gap

    bits_text = f"bits/param {bpp * 8:.2f}"
    if winner is None:
        return AnnotatedValue(
            "UNKNOWN",
            Label.UNKNOWN,
            source=f"{bits_text} does not match known schemes",
        )
    return AnnotatedValue(
        winner,
        Label.INFERRED,
        source=f"{bits_text} within tolerance of {winner}",
    )
|
src/llm_cal/weight_analyzer/fingerprint.py
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Quantization fingerprinting — tie-breakers for the reconciler.
|
| 2 |
+
|
| 3 |
+
When `reconciler.reconcile` has multiple schemes tied at the same bits/param
|
| 4 |
+
(FP4_FP8_MIXED, GPTQ_INT4, and AWQ_INT4 all sit at bpp=0.55), bytes alone
|
| 5 |
+
cannot pick a winner. We resolve the ambiguity with two stronger signals:
|
| 6 |
+
|
| 7 |
+
1. `quantization_config` in config.json — explicit declaration by the model
|
| 8 |
+
author. Covers most GPTQ/AWQ/FP8 community uploads.
|
| 9 |
+
|
| 10 |
+
2. safetensors per-tensor dtype + tensor-name patterns — the ground truth.
|
| 11 |
+
Covers models like DeepSeek-V4-Flash that use custom mixed-precision
|
| 12 |
+
packs without a config.json declaration.
|
| 13 |
+
|
| 14 |
+
Both return a `QuantFingerprint`. The reconciler uses the fingerprint's
|
| 15 |
+
`scheme` as a tie-breaker, and the `evidence` string flows into the
|
| 16 |
+
derivation trace.
|
| 17 |
+
|
| 18 |
+
This module is pure — no network, no file I/O. `safetensors_reader.py`
|
| 19 |
+
handles fetching; this module interprets what was fetched.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
from __future__ import annotations
|
| 23 |
+
|
| 24 |
+
from dataclasses import dataclass
|
| 25 |
+
from typing import Any, Literal
|
| 26 |
+
|
| 27 |
+
from llm_cal.weight_analyzer import QuantizationScheme
|
| 28 |
+
|
| 29 |
+
# The two places a fingerprint's evidence can be read from.
SourceType = Literal["config_json", "safetensors_header"]


@dataclass(frozen=True)
class QuantFingerprint:
    """Immutable record of an identified quantization scheme and its evidence."""

    # Scheme the evidence points to.
    scheme: QuantizationScheme
    # Which kind of input the evidence was read from.
    source_type: SourceType
    # Human-readable justification; meant for the derivation trace.
    evidence: str
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# ---------------------------------------------------------------------------
|
| 40 |
+
# Config.json: explicit quant_method declaration
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def from_config(config: dict[str, Any]) -> QuantFingerprint | None:
    """Read `config.json` `quantization_config` and map it to a scheme.

    Recognised declarations:
      * ``quant_method == "gptq"`` with ``bits`` 4 or 8
      * ``quant_method == "awq"`` with ``bits`` 4
      * ``quant_method == "fp8"``
      * ``quant_method == "compressed-tensors"`` — only the first dict-valued
        entry of ``config_groups`` is inspected (``weights.num_bits`` /
        ``weights.type``)
      * ``quant_method == "bitsandbytes"`` with ``load_in_4bit`` / ``load_in_8bit``
      * a standalone ``weight_dtype`` of ``float8_e4m3fn`` / ``float8_e5m2``

    Returns None if there is no `quantization_config` block, if nothing above
    matches, or if the block is malformed (for example ``config_groups`` or a
    group's ``weights`` is not a mapping). Malformed input never raises.
    """
    qc = config.get("quantization_config") if isinstance(config, dict) else None
    if not isinstance(qc, dict):
        return None

    def _declared(scheme: QuantizationScheme, evidence: str) -> QuantFingerprint:
        # Every result of this function is sourced from config.json.
        return QuantFingerprint(scheme=scheme, source_type="config_json", evidence=evidence)

    quant_method = qc.get("quant_method")
    bits = qc.get("bits")
    weight_dtype = qc.get("weight_dtype")

    # GPTQ family
    if quant_method == "gptq":
        if bits == 4:
            return _declared(
                "GPTQ_INT4", "config.json quantization_config.quant_method=gptq, bits=4"
            )
        if bits == 8:
            return _declared("INT8", "config.json quantization_config.quant_method=gptq, bits=8")

    # AWQ family
    if quant_method == "awq" and bits == 4:
        return _declared("AWQ_INT4", "config.json quantization_config.quant_method=awq, bits=4")

    # FP8 (native or compressed-tensors wrapping)
    if quant_method == "fp8":
        return _declared("FP8", "config.json quantization_config.quant_method=fp8")

    # compressed-tensors (RedHatAI etc.) — inspect inner weight dtype
    if quant_method == "compressed-tensors":
        groups = qc.get("config_groups")
        if not isinstance(groups, dict):
            # Missing or malformed (e.g. a list): nothing to inspect.
            groups = {}
        # Pick the first dict-valued group; anything it does not settle
        # degrades gracefully to the checks below and finally to None.
        for group in groups.values():
            if not isinstance(group, dict):
                continue
            weights = group.get("weights")
            if not isinstance(weights, dict):
                # Treat an absent or malformed weights entry as empty.
                weights = {}
            num_bits = weights.get("num_bits")
            wtype = weights.get("type")
            if num_bits == 8 and wtype in ("float", "fp8"):
                return _declared("FP8", "config.json compressed-tensors group weights=fp8/8bit")
            if num_bits == 8 and wtype == "int":
                return _declared("INT8", "config.json compressed-tensors group weights=int/8bit")
            if num_bits == 4 and wtype == "int":
                # Generic INT4 — don't claim GPTQ or AWQ without more evidence
                return _declared("INT4", "config.json compressed-tensors group weights=int/4bit")
            break  # first group only

    # bitsandbytes — load_in_4bit / load_in_8bit flags
    if quant_method == "bitsandbytes":
        if qc.get("load_in_4bit"):
            return _declared(
                "INT4", "config.json quant_method=bitsandbytes, load_in_4bit=true"
            )
        if qc.get("load_in_8bit"):
            return _declared(
                "INT8", "config.json quant_method=bitsandbytes, load_in_8bit=true"
            )

    # Standalone weight_dtype (no nested groups — some custom loaders)
    if weight_dtype in ("float8_e4m3fn", "float8_e5m2"):
        return _declared(
            "FP8", f"config.json quantization_config.weight_dtype={weight_dtype}"
        )

    return None
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
# ---------------------------------------------------------------------------
|
| 149 |
+
# Safetensors header: per-tensor dtype + tensor-name patterns
|
| 150 |
+
|
| 151 |
+
# safetensors dtype strings (from the format spec)
|
| 152 |
+
_FP8_DTYPES = frozenset({"F8_E4M3", "F8_E5M2"})
|
| 153 |
+
_FP4_DTYPES = frozenset({"F4_E2M1", "F4"}) # F4 is used by some toolchains
|
| 154 |
+
_FP16_DTYPES = frozenset({"F16"})
|
| 155 |
+
_BF16_DTYPES = frozenset({"BF16"})
|
| 156 |
+
_INT8_DTYPES = frozenset({"I8", "U8"})
|
| 157 |
+
# F8_E8M0 is the 8-bit shared-exponent scaling factor used by MX-format
|
| 158 |
+
# block-scaled quantization (MXFP4, MXFP8). Its presence alongside packed
|
| 159 |
+
# integer weights (I8) is the signature of FP4 weight packing.
|
| 160 |
+
_MX_SCALE_DTYPES = frozenset({"F8_E8M0"})
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def from_safetensors_dtypes(tensor_dtypes: dict[str, str]) -> QuantFingerprint | None:
    """Derive a fingerprint from a parsed safetensors header (name -> dtype tag).

    Checks run in this order and the first hit is returned:
      1. Tensor-name markers: ``qweight`` + ``g_idx`` means GPTQ_INT4;
         ``qweight`` + ``qzeros`` without ``g_idx`` means AWQ_INT4.
      2. An F8_E8M0 scale tensor anywhere plus I8/U8 weights means
         FP4_FP8_MIXED (with or without additional FP8 weights).
      3. FP4 and FP8 weights together mean FP4_FP8_MIXED.
      4. FP8 weights with no FP4 and no I8/U8 mean FP8.
      5. Exactly one of F16 / BF16 / I8-U8 among the tracked families means
         FP16 / BF16 / INT8 respectively.

    Steps 2-5 look only at "weight-like" tensors, selected by name. If the
    name filter selects nothing, every tensor is considered instead. Returns
    None for an empty header or a mix that has no named scheme.
    """
    if not tensor_dtypes:
        return None

    def _fingerprint(scheme: QuantizationScheme, evidence: str) -> QuantFingerprint:
        return QuantFingerprint(
            scheme=scheme,
            source_type="safetensors_header",
            evidence=evidence,
        )

    def _any_name_ends_with(stem: str) -> bool:
        # Accept both "layer.qweight" and "layer_qweight" spellings.
        return any(name.endswith(("." + stem, "_" + stem)) for name in tensor_dtypes)

    # ------------------------------------------------------------------
    # Packed-int4 schemes are recognised by name: their payload dtype is a
    # bit-packed integer, so the dtype histogram cannot identify them.
    if _any_name_ends_with("qweight"):
        if _any_name_ends_with("g_idx"):
            return _fingerprint(
                "GPTQ_INT4",
                "safetensors header has .qweight + .g_idx tensors (GPTQ marker)",
            )
        if _any_name_ends_with("qzeros"):
            return _fingerprint(
                "AWQ_INT4",
                "safetensors header has .qweight + .qzeros, no .g_idx (AWQ marker)",
            )

    # ------------------------------------------------------------------
    # Dtype histogram over weight-like tensors only.
    # NOTE(review): the ".norm" marker does not match names such as
    # "input_layernorm.weight" — confirm whether those should be skipped too.
    skip_markers = (".norm", ".bias", "embed", "lm_head")

    def _looks_like_weight(tensor_name: str) -> bool:
        lowered = tensor_name.lower()
        if any(marker in lowered for marker in skip_markers):
            return False
        return "weight" in lowered or lowered.endswith((".w", ".proj"))

    weight_dtypes: list[str] = [
        dtype for tensor_name, dtype in tensor_dtypes.items() if _looks_like_weight(tensor_name)
    ]
    if not weight_dtypes:
        # The name heuristic matched nothing, so consider every tensor.
        weight_dtypes = list(tensor_dtypes.values())

    def _count(family: frozenset[str]) -> int:
        return sum(dtype in family for dtype in weight_dtypes)

    families = (
        ("fp4", _FP4_DTYPES),
        ("fp8", _FP8_DTYPES),
        ("fp16", _FP16_DTYPES),
        ("bf16", _BF16_DTYPES),
        ("int8", _INT8_DTYPES),
    )
    present = {
        tag for tag, members in families if any(dtype in members for dtype in weight_dtypes)
    }
    # Scale tensors are not weight-like, so look for them across all tensors.
    has_mx_scale = any(dtype in _MX_SCALE_DTYPES for dtype in tensor_dtypes.values())
    total = len(weight_dtypes)

    # MX-format block-scaled quantization (DeepSeek-V4-Flash pattern):
    # F8_E8M0 scales + packed I8 weights, optionally with an FP8 sub-pack.
    if has_mx_scale and "int8" in present:
        packed = _count(_INT8_DTYPES)
        if "fp8" in present:
            return _fingerprint(
                "FP4_FP8_MIXED",
                f"safetensors header: F8_E8M0 scale tensors + {packed} packed-I8 "
                f"(FP4) weights + {_count(_FP8_DTYPES)} FP8 weights — "
                "MX block-scaled mixed pack",
            )
        # MXFP4 only — reported under the closest existing scheme (bpp ≈ 0.55).
        return _fingerprint(
            "FP4_FP8_MIXED",
            f"safetensors header: F8_E8M0 scale tensors + {packed} packed-I8 "
            "(FP4) weights — MXFP4 block-scaled",
        )

    # Classic FP4 + FP8 mix (toolchains that expose an F4 dtype directly).
    if {"fp4", "fp8"} <= present:
        return _fingerprint(
            "FP4_FP8_MIXED",
            "safetensors header has both FP4 and FP8 weight tensors "
            f"({_count(_FP4_DTYPES)} FP4, {_count(_FP8_DTYPES)} FP8)",
        )

    # FP8 weights with no FP4 / I8 alongside; 16-bit tensors are tolerated.
    if "fp8" in present and not (present & {"fp4", "int8"}):
        return _fingerprint(
            "FP8",
            f"safetensors header: {_count(_FP8_DTYPES)}/{total} weight tensors are FP8",
        )

    # Single-family cases: exactly one tracked family is present.
    single_family: tuple[tuple[str, QuantizationScheme, str], ...] = (
        ("fp16", "FP16", f"safetensors header: all {total} weight tensors are F16"),
        ("bf16", "BF16", f"safetensors header: all {total} weight tensors are BF16"),
        ("int8", "INT8", f"safetensors header: {total} weight tensors are INT8"),
    )
    for tag, scheme, evidence in single_family:
        if present == {tag}:
            return _fingerprint(scheme, evidence)

    # A mix with no named scheme — report nothing.
    return None
|
src/llm_cal/weight_analyzer/reconciler.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Reconciler — compare observed weight bytes vs computed under each quantization assumption.
|
| 2 |
+
|
| 3 |
+
This is the module that outputs the DeepSeek-V4-Flash story (Problem Evidence in design doc):
|
| 4 |
+
"gpu_poor says 285 GB (assumes pure FP8); we say 160 GB (observed bytes match FP4+FP8
|
| 5 |
+
pack hypothesis). Here's why."
|
| 6 |
+
|
| 7 |
+
Core value: makes the quantization inference step transparent. The user sees all
|
| 8 |
+
candidates considered, not just the winner.
|
| 9 |
+
|
| 10 |
+
When multiple schemes share the same bytes-per-param anchor (FP4_FP8_MIXED,
|
| 11 |
+
GPTQ_INT4, and AWQ_INT4 all sit at bpp=0.55), bytes alone cannot pick a winner.
|
| 12 |
+
Pass a `QuantFingerprint` from `fingerprint.from_config()` or
|
| 13 |
+
`fingerprint.from_safetensors_dtypes()` to break the tie with authoritative
|
| 14 |
+
evidence.
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
from dataclasses import dataclass
|
| 20 |
+
|
| 21 |
+
from llm_cal.output.labels import AnnotatedValue, Label
|
| 22 |
+
from llm_cal.weight_analyzer import _QUANT_BPP, QuantizationScheme
|
| 23 |
+
from llm_cal.weight_analyzer.fingerprint import QuantFingerprint
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass(frozen=True)
class ReconciliationCandidate:
    """One quantization hypothesis scored against the observed weight bytes.

    Attributes:
        scheme: Quantization scheme this hypothesis assumes.
        predicted_bytes: Weight bytes expected under that scheme.
        delta_bytes: Observed minus predicted; positive means the observed
            files are larger than the prediction.
        relative_error: ``|delta_bytes| / predicted_bytes``.
    """

    scheme: QuantizationScheme
    predicted_bytes: int
    delta_bytes: int
    relative_error: float
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
@dataclass(frozen=True)
class ReconciliationReport:
    """Outcome of comparing observed weight bytes against quantization hypotheses.

    Attributes:
        observed_bytes: Byte count that was observed for the weights.
        total_params: Parameter count the predictions were based on.
        candidates: Every hypothesis that was scored, sorted by ascending
            ``relative_error``.
        best: The chosen scheme together with its label and source note.
    """

    observed_bytes: int
    total_params: int
    candidates: tuple[ReconciliationCandidate, ...]
    best: AnnotatedValue[QuantizationScheme]

    def summary_line(self) -> str:
        """Return a one-line, human-readable summary for the output formatter.

        NOTE(review): the line describes ``candidates[0]`` (the closest bytes
        match), which is not necessarily the scheme stored in ``best`` —
        confirm that is the intended wording for callers.
        """
        observed = f"{self.observed_bytes:,}"
        if self.candidates:
            top = self.candidates[0]
            error_pct = top.relative_error * 100
            return (
                f"Observed {observed} bytes. "
                f"Best match: {top.scheme} "
                f"(predicts {top.predicted_bytes:,} bytes, "
                f"{error_pct:.1f}% error)"
            )
        return f"{observed} bytes — no quantization candidates tested"
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# Tolerance for tie detection — a candidate counts as tied with the winner when
# its relative error differs from the winner's by less than this amount
# (0.01 = one percentage point of relative error).
_TIE_THRESHOLD = 0.01

# Tolerance gate — if the closest candidate's relative error is greater than
# this (0.15 = 15%), the bytes-only path reports UNKNOWN instead of a scheme.
_UNKNOWN_THRESHOLD = 0.15
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def reconcile(
    observed_bytes: int,
    total_params: int,
    fingerprint: QuantFingerprint | None = None,
) -> ReconciliationReport:
    """Score every known quantization scheme against the observed byte count.

    Args:
        observed_bytes: Sum of safetensors file sizes.
        total_params: Estimated param count.
        fingerprint: Optional authoritative evidence from config.json or the
            safetensors header. When given, the verdict is delegated to
            ``_reconcile_with_fingerprint``.

    Returns:
        A report carrying the full ranking (ascending relative error) so the
        formatter can show every hypothesis, plus the chosen scheme:
        ``UNKNOWN`` when either input is zero or the closest candidate misses
        by more than ``_UNKNOWN_THRESHOLD``, otherwise the closest candidate
        labelled ``INFERRED`` (with a tie note when several schemes fit
        equally well).
    """
    if observed_bytes == 0 or total_params == 0:
        nothing_to_compare = AnnotatedValue(
            "UNKNOWN",
            Label.UNKNOWN,
            source="observed_bytes or total_params is zero",
        )
        return ReconciliationReport(
            observed_bytes=observed_bytes,
            total_params=total_params,
            candidates=(),
            best=nothing_to_compare,
        )

    def _score(scheme: QuantizationScheme, bpp: float) -> ReconciliationCandidate:
        # Predicted size under this scheme and how far the observation is from it.
        predicted = int(bpp * total_params)
        delta = observed_bytes - predicted
        return ReconciliationCandidate(
            scheme=scheme,
            predicted_bytes=predicted,
            delta_bytes=delta,
            relative_error=abs(delta) / predicted if predicted else float("inf"),
        )

    # Skip the UNKNOWN placeholder and any scheme without a bytes-per-param anchor.
    ranked = sorted(
        (
            _score(scheme, bpp)
            for scheme, bpp in _QUANT_BPP.items()
            if scheme != "UNKNOWN" and bpp != 0.0
        ),
        key=lambda cand: cand.relative_error,
    )
    ranking = tuple(ranked)

    closest = ranked[0]
    argmin_scheme = closest.scheme
    argmin_err = closest.relative_error

    # Authoritative metadata takes precedence over a bytes-only guess.
    if fingerprint is not None:
        return _reconcile_with_fingerprint(
            observed_bytes=observed_bytes,
            total_params=total_params,
            candidates=ranking,
            fingerprint=fingerprint,
            argmin_scheme=argmin_scheme,
            argmin_err=argmin_err,
        )

    if argmin_err > _UNKNOWN_THRESHOLD:
        # Even the closest hypothesis is too far off to name a scheme.
        verdict = AnnotatedValue(
            "UNKNOWN",
            Label.UNKNOWN,
            source=(
                f"closest candidate ({argmin_scheme}) is off by "
                f"{argmin_err * 100:.1f}% — no confident match"
            ),
        )
    else:
        source_text = (
            f"best match among {len(ranked)} candidates, {argmin_err * 100:.1f}% error"
        )
        # Schemes whose error is indistinguishable from the winner's.
        tied_schemes = [
            cand.scheme
            for cand in ranked
            if abs(cand.relative_error - argmin_err) < _TIE_THRESHOLD
            and cand.relative_error <= _UNKNOWN_THRESHOLD
        ]
        if len(tied_schemes) > 1:
            rivals = ", ".join(s for s in tied_schemes if s != argmin_scheme)
            source_text += (
                f" — tied with {rivals} "
                "at the same bits/param; distinguishing requires config.json "
                "quantization_config or safetensors per-tensor dtype "
                "(neither available for this model)"
            )
        verdict = AnnotatedValue(argmin_scheme, Label.INFERRED, source=source_text)

    return ReconciliationReport(
        observed_bytes=observed_bytes,
        total_params=total_params,
        candidates=ranking,
        best=verdict,
    )
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def _reconcile_with_fingerprint(
    observed_bytes: int,
    total_params: int,
    candidates: tuple[ReconciliationCandidate, ...],
    fingerprint: QuantFingerprint,
    argmin_scheme: QuantizationScheme,
    argmin_err: float,
) -> ReconciliationReport:
    """Pick the verdict when an authoritative fingerprint is available.

    Three outcomes, decided by the candidate matching ``fingerprint.scheme``:

    - No such candidate: fall back to the bytes argmin, labelled INFERRED,
      and say so in the source note.
    - Candidate within ``_UNKNOWN_THRESHOLD``: adopt the declared scheme,
      labelled VERIFIED. If bytes alone would have preferred a different
      scheme with a smaller error, the note mentions it.
    - Candidate beyond the threshold: still adopt the declared scheme as
      VERIFIED, and record the discrepancy in the source note.

    The returned report always carries ``candidates`` unchanged.
    """
    declared = fingerprint.scheme

    # First candidate whose scheme equals the declared one, if any.
    declared_match = None
    for cand in candidates:
        if cand.scheme == declared:
            declared_match = cand
            break

    if declared_match is None:
        verdict = AnnotatedValue(
            argmin_scheme,
            Label.INFERRED,
            source=(
                f"fingerprint declared {declared} ({fingerprint.evidence}) "
                f"but we have no bpp anchor for it; fell back to bytes match "
                f"{argmin_scheme} at {argmin_err * 100:.1f}% error"
            ),
        )
    elif declared_match.relative_error <= _UNKNOWN_THRESHOLD:
        bytes_prefer_other = (
            declared != argmin_scheme and argmin_err < declared_match.relative_error
        )
        if bytes_prefer_other:
            note = (
                f" (bytes alone would argmin to {argmin_scheme} at "
                f"{argmin_err * 100:.1f}%; we trust the declaration)"
            )
        else:
            note = ""
        verdict = AnnotatedValue(
            declared,
            Label.VERIFIED,
            source=(
                f"{fingerprint.evidence} "
                f"(predicts {declared_match.predicted_bytes:,} bytes, "
                f"{declared_match.relative_error * 100:.1f}% error){note}"
            ),
        )
    else:
        verdict = AnnotatedValue(
            declared,
            Label.VERIFIED,
            source=(
                f"{fingerprint.evidence} "
                f"(NOTE: bytes predict {declared_match.predicted_bytes:,}, off by "
                f"{declared_match.relative_error * 100:.1f}% — likely our param estimate is off, "
                f"not the declaration)"
            ),
        )

    return ReconciliationReport(
        observed_bytes=observed_bytes,
        total_params=total_params,
        candidates=candidates,
        best=verdict,
    )
|
src/llm_cal/weight_analyzer/safetensors_reader.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Fetch the safetensors header of one shard to recover per-tensor dtypes.
|
| 2 |
+
|
| 3 |
+
The safetensors binary format:
|
| 4 |
+
bytes[0..8] uint64 little-endian header length N (JSON bytes)
|
| 5 |
+
bytes[8..8+N] UTF-8 JSON tensor_name -> {dtype, shape, data_offsets}
|
| 6 |
+
bytes[8+N..] raw tensor data (we never read this)
|
| 7 |
+
|
| 8 |
+
So we can identify every tensor's dtype without downloading any weight bytes.
|
| 9 |
+
Headers are usually 50 KB - 2 MB. We cap the Range request at 16 MB as a
|
| 10 |
+
safety net; anything larger is treated as malformed.
|
| 11 |
+
|
| 12 |
+
This module NEVER raises on network or parse error — it returns None so
|
| 13 |
+
the caller can degrade gracefully. The honesty principle: "we tried and
|
| 14 |
+
failed to resolve the tie" is a legitimate outcome, not a fatal error.
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
import json
|
| 20 |
+
import struct
|
| 21 |
+
from typing import Any
|
| 22 |
+
|
| 23 |
+
import httpx
|
| 24 |
+
|
| 25 |
+
from llm_cal.model_source.auth import get_hf_token, get_modelscope_token
|
| 26 |
+
from llm_cal.model_source.base import SiblingFile
|
| 27 |
+
|
| 28 |
+
# Largest header length (as declared in the 8-byte prefix) that parse_header
# accepts; anything above this is rejected.
_MAX_HEADER_BYTES = 16 * 1024 * 1024  # 16 MB — far above any realistic header
# Number of leading bytes requested via the HTTP Range header.
# NOTE(review): the 8-byte length prefix is part of this window, so a header
# longer than 16 MB minus 8 bytes can never be fetched in full — confirm OK.
_RANGE_FETCH_BYTES = 16 * 1024 * 1024
# Default timeout, in seconds, for the header fetch.
_DEFAULT_TIMEOUT_S = 15.0
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def pick_sample_shard(siblings: tuple[SiblingFile, ...]) -> SiblingFile | None:
    """Select a single ``*.safetensors`` file to sample dtypes from.

    Selection rules, in order:
      1. A file named exactly ``model.safetensors`` wins outright.
      2. Otherwise the safetensors files are sorted by filename and the one at
         index ``len // 2`` is returned. The intent is to avoid the first
         shard, which tends to hold embeddings / lm_head / early norms that
         are often left unquantized.
      3. With no safetensors files at all, ``None`` is returned.
    """
    shards: list[SiblingFile] = []
    for entry in siblings:
        name = entry.filename
        if not name.endswith(".safetensors"):
            continue
        if name == "model.safetensors":
            # Single-file layout: nothing else can be more representative.
            return entry
        shards.append(entry)

    if not shards:
        return None

    shards.sort(key=lambda entry: entry.filename)
    middle = len(shards) // 2
    return shards[middle]
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def fetch_tensor_dtypes(
    source: str,
    model_id: str,
    revision: str,
    shard_filename: str,
    endpoint: str | None = None,
    timeout_s: float = _DEFAULT_TIMEOUT_S,
) -> dict[str, str] | None:
    """Range-fetch the safetensors header of one shard and return its dtype map.

    Args:
        source: Model hub identifier; passed to ``_build_request``, which
            returns no URL for hubs it does not know.
        model_id: Repository id on that hub.
        revision: Branch, tag or commit to read from.
        shard_filename: Path of the safetensors file inside the repository.
        endpoint: Optional base URL overriding the hub's default.
        timeout_s: Timeout in seconds handed to httpx.

    Returns:
        ``{tensor_name: dtype_string}`` on success, ``None`` on any failure
        (unknown source, invalid URL, transport error, unexpected status,
        unparseable header). Non-fatal by design so the caller can still
        report a verdict without per-tensor refinement.
    """
    url, headers = _build_request(source, model_id, revision, shard_filename, endpoint)
    if url is None:
        return None

    # Only the leading window is needed: the header sits at the start of the file.
    headers = {**headers, "Range": f"bytes=0-{_RANGE_FETCH_BYTES - 1}"}

    try:
        resp = httpx.get(url, headers=headers, timeout=timeout_s, follow_redirects=True)
    except (httpx.HTTPError, httpx.InvalidURL):
        # HTTPError is the common base of timeouts, connect errors and every
        # other transport failure. InvalidURL derives directly from Exception,
        # so it has to be named explicitly to keep this function non-raising
        # when a caller supplies a malformed endpoint or model id.
        return None

    # 200 for small files returned in full; 206 for an actual Range response.
    # Anything else (404, 403, 500, ...) we degrade silently.
    # NOTE(review): httpx.get buffers the whole body, so a server that ignores
    # Range and answers 200 for a large shard would be downloaded in full —
    # consider a streamed read with a byte cap.
    if resp.status_code not in (200, 206):
        return None

    return parse_header(resp.content)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def _build_request(
|
| 94 |
+
source: str,
|
| 95 |
+
model_id: str,
|
| 96 |
+
revision: str,
|
| 97 |
+
shard_filename: str,
|
| 98 |
+
endpoint: str | None,
|
| 99 |
+
) -> tuple[str | None, dict[str, str]]:
|
| 100 |
+
"""Compose URL + auth headers for the source. Returns (None, {}) on unknown."""
|
| 101 |
+
if source == "huggingface":
|
| 102 |
+
base = (endpoint or "https://huggingface.co").rstrip("/")
|
| 103 |
+
url = f"{base}/{model_id}/resolve/{revision}/{shard_filename}"
|
| 104 |
+
token = get_hf_token()
|
| 105 |
+
headers = {"Authorization": f"Bearer {token}"} if token else {}
|
| 106 |
+
return url, headers
|
| 107 |
+
if source == "modelscope":
|
| 108 |
+
# ModelScope raw-file endpoint takes the path via query string and
|
| 109 |
+
# 302-redirects to the underlying OSS object. httpx follows the
|
| 110 |
+
# redirect; OSS honors Range natively.
|
| 111 |
+
base = (endpoint or "https://www.modelscope.cn").rstrip("/")
|
| 112 |
+
# httpx will encode query params; build manually to keep this function
|
| 113 |
+
# ergonomically a one-liner that matches the rest of the module.
|
| 114 |
+
url = (
|
| 115 |
+
f"{base}/api/v1/models/{model_id}/repo"
|
| 116 |
+
f"?FilePath={shard_filename}&Revision={revision}"
|
| 117 |
+
)
|
| 118 |
+
token = get_modelscope_token()
|
| 119 |
+
headers = {"Authorization": f"Bearer {token}"} if token else {}
|
| 120 |
+
return url, headers
|
| 121 |
+
return None, {}
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def parse_header(
    content: bytes,
    *,
    max_header_bytes: int | None = None,
) -> dict[str, str] | None:
    """Parse the safetensors binary header from a leading byte buffer.

    The buffer is expected to start with an 8-byte little-endian length ``N``
    followed by ``N`` bytes of JSON mapping tensor names to metadata objects.

    Args:
        content: Leading bytes of a safetensors file; may extend past the header.
        max_header_bytes: Largest declared header length to accept. Defaults
            to the module-wide ``_MAX_HEADER_BYTES``.

    Returns:
        ``{tensor_name: dtype}`` for every entry (other than ``__metadata__``)
        that is an object with a string ``dtype``. ``None`` when the buffer is
        too short, the declared length is zero or above the cap, the JSON is
        invalid or nested too deeply to decode, the top level is not an
        object, or no tensor entry qualifies.
    """
    if len(content) < 8:
        return None

    limit = _MAX_HEADER_BYTES if max_header_bytes is None else max_header_bytes

    # With at least 8 bytes present, unpacking a "<Q" cannot fail.
    (header_len,) = struct.unpack_from("<Q", content)
    if header_len == 0 or header_len > limit:
        return None

    header_end = 8 + header_len
    if len(content) < header_end:
        # Header is truncated: the fetch window did not cover all of it.
        return None

    try:
        header = json.loads(content[8:header_end])
    except (json.JSONDecodeError, UnicodeDecodeError, RecursionError):
        # RecursionError: hostile or corrupt input made of deeply nested
        # arrays/objects must degrade to None like any other malformed header.
        return None

    if not isinstance(header, dict):
        return None

    dtypes: dict[str, str] = {}
    for name, info in header.items():
        if name == "__metadata__" or not isinstance(info, dict):
            continue
        dtype = info.get("dtype")
        if isinstance(dtype, str):
            dtypes[name] = dtype

    return dtypes or None
|