Spaces:

bluemoonsoldout
/

llm-cal

Running

App Files Files Community

GitHub Actions committed 28 days ago

Commit

cc6274a

0 Parent(s):

Auto-deploy from GitHub Actions

Browse files

Files changed (50) hide show

README.md +38 -0
app.py +2376 -0
requirements.txt +2 -0
src/llm_cal/__init__.py +6 -0
src/llm_cal/architecture/__init__.py +0 -0
src/llm_cal/architecture/detector.py +134 -0
src/llm_cal/architecture/formulas/__init__.py +0 -0
src/llm_cal/architecture/formulas/kv_cache.py +145 -0
src/llm_cal/architecture/formulas/weight.py +133 -0
src/llm_cal/architecture/profile.py +97 -0
src/llm_cal/architecture/traits.py +150 -0
src/llm_cal/benchmark/__init__.py +0 -0
src/llm_cal/benchmark/dataset.yaml +203 -0
src/llm_cal/benchmark/runner.py +232 -0
src/llm_cal/cli.py +207 -0
src/llm_cal/command_generator/__init__.py +0 -0
src/llm_cal/command_generator/sglang.py +50 -0
src/llm_cal/command_generator/vllm.py +55 -0
src/llm_cal/common/__init__.py +0 -0
src/llm_cal/common/i18n.py +421 -0
src/llm_cal/common/yaml_loader.py +48 -0
src/llm_cal/core/__init__.py +0 -0
src/llm_cal/core/cache.py +97 -0
src/llm_cal/core/evaluator.py +375 -0
src/llm_cal/core/explain.py +504 -0
src/llm_cal/engine_compat/__init__.py +0 -0
src/llm_cal/engine_compat/loader.py +118 -0
src/llm_cal/engine_compat/matrix.yaml +512 -0
src/llm_cal/fleet/__init__.py +0 -0
src/llm_cal/fleet/planner.py +282 -0
src/llm_cal/hardware/__init__.py +0 -0
src/llm_cal/hardware/gpu_database.yaml +613 -0
src/llm_cal/hardware/loader.py +77 -0
src/llm_cal/llm_review/__init__.py +0 -0
src/llm_cal/llm_review/reviewer.py +218 -0
src/llm_cal/model_source/__init__.py +0 -0
src/llm_cal/model_source/auth.py +33 -0
src/llm_cal/model_source/base.py +58 -0
src/llm_cal/model_source/huggingface.py +118 -0
src/llm_cal/model_source/modelscope.py +229 -0
src/llm_cal/output/__init__.py +0 -0
src/llm_cal/output/formatter.py +665 -0
src/llm_cal/output/labels.py +46 -0
src/llm_cal/performance/__init__.py +0 -0
src/llm_cal/performance/compute.py +233 -0
src/llm_cal/performance/concurrency.py +132 -0
src/llm_cal/weight_analyzer/__init__.py +146 -0
src/llm_cal/weight_analyzer/fingerprint.py +292 -0
src/llm_cal/weight_analyzer/reconciler.py +247 -0
src/llm_cal/weight_analyzer/safetensors_reader.py +163 -0

README.md ADDED Viewed

	@@ -0,0 +1,38 @@

+---
+title: llm-cal
+emoji: 🧮
+colorFrom: indigo
+colorTo: blue
+sdk: gradio
+sdk_version: 6.13.0
+app_file: app.py
+pinned: false
+license: apache-2.0
+short_description: LLM inference sizing — honest, architecture-aware
+---
+# llm-cal — LLM inference hardware calculator
+Web UI for [`llm-cal`](https://github.com/FlyTOmeLight/llm-cal). Pick a model, pick a GPU, get a hardware plan.
+Architecture-aware (MLA, NSA, CSA+HCA, MoE, sliding window). Engine-aware (vLLM, SGLang). Honest-labeled — every number carries a provenance tag (`[verified]` / `[inferred]` / `[estimated]` / `[cited]` / `[unverified]` / `[unknown]`).
+## The story this Space exists to tell
+`gpu_poor` reports DeepSeek-V4-Flash as 284 GB by assuming pure FP8. The real safetensors weight is 160 GB — it ships an FP4+FP8 mixed pack. `llm-cal` reads the actual on-disk dtype (per-tensor metadata + MX block-scaled scale tensors) and gets 160.01 GB at **0.2% error**.
+That's the whole pitch.
+## Local
+```bash
+pip install llm-cal gradio
+python app.py
+```
+## Links
+- [GitHub repo](https://github.com/FlyTOmeLight/llm-cal)
+- [Full docs](https://flytomelight.github.io/llm-cal/)
+- [Methodology](https://flytomelight.github.io/llm-cal/methodology/) — every formula's primary source
+- [Pre-rendered model pages](https://flytomelight.github.io/llm-cal/models/) — popular model × GPU combos

app.py ADDED Viewed

	@@ -0,0 +1,2376 @@

+"""llm-cal Gradio web app — deploys to HuggingFace Spaces.
+User journey:
+  1. Type a HuggingFace model id (or pick from examples)
+  2. Choose target GPU
+  3. Hit Calculate
+  4. Read the same `--explain`-quality output the CLI gives you, but in a browser
+     and shareable via URL parameters.
+The whole compute is the existing Python `Evaluator`. No new logic.
+Local run:
+  python web/app.py
+HF Spaces:
+  This file is the entry point Spaces expects. requirements.txt sits next to it.
+"""
+from __future__ import annotations
+import sys
+from pathlib import Path
+# Ensure src/ is importable. Two layouts supported:
+#   1. Local dev:  /repo/web/app.py + /repo/src/        (parent.parent / src)
+#   2. HF Space:   /space/app.py    + /space/src/       (parent / src)
+# The deploy workflow flattens layout 1 → layout 2 when pushing to the Space.
+_HERE = Path(__file__).resolve().parent
+for _candidate in (_HERE / "src", _HERE.parent / "src"):
+    if _candidate.exists():
+        sys.path.insert(0, str(_candidate))
+        break
+import os  # noqa: E402
+import gradio as gr  # noqa: E402
+from llm_cal.common.i18n import set_locale, t  # noqa: E402
+from llm_cal.core.evaluator import EvaluationReport, Evaluator  # noqa: E402
+from llm_cal.core.explain import ExplainEntry  # noqa: E402
+from llm_cal.core.explain import build as build_explain  # noqa: E402
+from llm_cal.hardware.loader import load_database  # noqa: E402
+from llm_cal.llm_review.reviewer import run_review  # noqa: E402
+from llm_cal.model_source.huggingface import HuggingFaceSource  # noqa: E402
+from llm_cal.model_source.modelscope import ModelScopeSource  # noqa: E402
+# ---------------------------------------------------------------------------
+# Static data the UI needs
+_DB = load_database()
+def _classify_vendor(gpu_id: str) -> tuple[str, str]:
+    """Map a GPU id to (vendor_en, vendor_zh).
+    Vendor isn't in the YAML schema (yet), so derive from the id prefix.
+    """
+    gid = gpu_id.upper()
+    if gid in {"B200", "GB200", "H100", "H800", "H200", "H20", "GH200"} or gid.startswith(
+        ("L4", "L40", "RTX", "A10", "A100", "A40", "V100", "T4")
+    ):
+        return ("NVIDIA", "NVIDIA")
+    if gid.startswith("MI"):
+        return ("AMD", "AMD")
+    if gid.startswith("GAUDI"):
+        return ("Intel Habana", "英特尔 Habana")
+    if gid.startswith("910") or gid.startswith("ATLAS"):
+        return ("Huawei Ascend", "华为昇腾")
+    if gid.startswith("MXC"):
+        return ("MetaX 沐曦", "沐曦 MetaX")
+    if gid.startswith("KUNLUN"):
+        return ("Kunlunxin 昆仑芯", "昆仑芯 Kunlunxin")
+    if gid.startswith("BR"):
+        return ("Biren 壁仞", "壁仞 Biren")
+    if gid.startswith("BI-"):
+        return ("Iluvatar 天数智芯", "天数智芯 Iluvatar")
+    if gid.startswith(("MR-", "MTT")):
+        return ("Moore Threads 摩尔线程", "摩尔线程 Moore Threads")
+    if gid.startswith("MLU"):
+        return ("Cambricon 寒武纪", "寒武纪 Cambricon")
+    if gid.startswith("HYGON"):
+        return ("Hygon 海光", "海光 Hygon")
+    return ("Other", "其他")
+# Stable vendor display order
+_VENDOR_ORDER = [
+    "NVIDIA",
+    "AMD",
+    "Intel Habana",
+    "Huawei Ascend",
+    "MetaX 沐曦",
+    "Kunlunxin 昆仑芯",
+    "Biren 壁仞",
+    "Iluvatar 天数智芯",
+    "Moore Threads 摩尔线程",
+    "Cambricon 寒武纪",
+    "Hygon 海光",
+    "Other",
+]
+def _build_vendor_index() -> dict[str, list[str]]:
+    """vendor_en -> sorted list of GPU ids"""
+    out: dict[str, list[str]] = {v: [] for v in _VENDOR_ORDER}
+    for g in _DB.gpus:
+        v_en, _ = _classify_vendor(g.id)
+        out.setdefault(v_en, []).append(g.id)
+    for v in out:
+        out[v].sort()
+    # Drop empty buckets
+    return {v: ids for v, ids in out.items() if ids}
+# GPU ids grouped by English vendor name, computed once when the module is imported.
+_VENDOR_TO_GPUS = _build_vendor_index()
+# Vendor names that have at least one GPU, in the index's key order.
+VENDOR_CHOICES_EN: list[str] = list(_VENDOR_TO_GPUS.keys())
+# NOTE(review): presumably the initial vendor / GPU selection shown in the UI — confirm against the widget setup.
+DEFAULT_VENDOR = "NVIDIA"
+DEFAULT_GPU = "H800"
+EXAMPLE_MODELS: list[tuple[str, str, str, str, str]] = [
+    # (model_id, vendor, gpu, engine, source)
+    ("deepseek-ai/DeepSeek-V4-Flash", "NVIDIA", "H800", "vllm", "HuggingFace"),
+    ("deepseek-ai/DeepSeek-V3", "NVIDIA", "H800", "vllm", "HuggingFace"),
+    ("Qwen/Qwen2.5-72B-Instruct", "NVIDIA", "H100", "vllm", "HuggingFace"),
+    ("Qwen/Qwen3-30B-A3B", "NVIDIA", "A100-80G", "vllm", "HuggingFace"),
+    ("mistralai/Mixtral-8x7B-v0.1", "NVIDIA", "H100", "vllm", "HuggingFace"),
+    ("microsoft/Phi-4", "NVIDIA", "RTX4090", "vllm", "HuggingFace"),
+    ("deepseek-ai/DeepSeek-V4-Flash", "Huawei Ascend", "910B4", "vllm", "HuggingFace"),
+    # ModelScope examples — same models, China-side mirror.
+    ("Qwen/Qwen3-30B-A3B", "NVIDIA", "A100-80G", "vllm", "ModelScope"),
+    ("deepseek-ai/DeepSeek-V3", "Huawei Ascend", "910B4", "vllm", "ModelScope"),
+]
+# ---------------------------------------------------------------------------
+# Output rendering
+def _fmt_bytes(n: int | None) -> str:
+    if n is None:
+        return "—"
+    if n < 1024:
+        return f"{n} B"
+    f = float(n)
+    for u in ["KB", "MB", "GB", "TB"]:
+        f /= 1024
+        if f < 1024:
+            return f"{f:.2f} {u}"
+    return f"{f:.2f} PB"
+def _fmt_params(n: int | None) -> str:
+    if not n:
+        return "—"
+    if n >= 1_000_000_000:
+        return f"{n / 1_000_000_000:.1f}B"
+    if n >= 1_000_000:
+        return f"{n / 1_000_000:.1f}M"
+    return f"{n:,}"
+def _label_color(label: str) -> str:
+    """Map a provenance label to a CSS color (visible in both light and dark)."""
+    return {
+        "verified": "#16a34a",  # green-600
+        "inferred": "#2563eb",  # blue-600
+        "estimated": "#d97706",  # amber-600
+        "cited": "#7c3aed",  # violet-600
+        "unverified": "#9a3412",  # orange-800
+        "unknown": "#6b7280",  # gray-500
+        "llm-opinion": "#db2777",  # pink-600
+    }.get(label, "#6b7280")
+def _label_chip(label_key: str) -> str:
+    """Render a [label] chip with the right color."""
+    color = _label_color(label_key)
+    text = t(f"label.{label_key}")
+    return (
+        f'<span class="lc-chip" style="background:{color}1a;color:{color};'
+        f'border:1px solid {color}55">{text}</span>'
+    )
+def _stat_card(label: str, value: str, sublabel: str = "", chip: str = "") -> str:
+    chip_html = f"<div class='lc-stat-chip'>{chip}</div>" if chip else ""
+    sub_html = f"<div class='lc-stat-sub'>{sublabel}</div>" if sublabel else ""
+    return (
+        f"<div class='lc-stat'>"
+        f"<div class='lc-stat-value'>{value}</div>"
+        f"<div class='lc-stat-label'>{label}</div>"
+        f"{sub_html}{chip_html}"
+        f"</div>"
+    )
+def _esc(s: str) -> str:
+    return (
+        str(s)
+        .replace("&", "&amp;")
+        .replace("<", "&lt;")
+        .replace(">", "&gt;")
+    )
+def _render(report: EvaluationReport, locale: str) -> str:
+    """Render a single evaluation report as one HTML fragment.
+    Sets the active i18n locale, then builds the sections in turn: headline
+    stat cards, architecture, quantization reconciliation, fleet, performance,
+    per-request KV cache, engine compatibility, GPU spec and generated
+    command. The fleet, performance, KV, engine, GPU and command sections are
+    left empty when the report lacks the corresponding data. The sections are
+    concatenated inside a ``<div class='lc-result'>`` wrapper, followed by the
+    output of ``_render_star_cta``.
+    Args:
+        report: The evaluation result to display.
+        locale: Locale code; ``"zh"`` selects the Chinese strings, anything
+            else the English ones.
+    Returns:
+        The HTML string for the whole result panel.
+    """
+    set_locale(locale)  # type: ignore[arg-type]
+    is_zh = locale == "zh"
+    p, w, r, f = report.profile, report.weight, report.reconciliation, report.fleet
+    # ---- Headline stat cards -------------------------------------------------
+    weight_str = _fmt_bytes(w.total_bytes.value)
+    weight_chip = _label_chip(w.total_bytes.label.value)
+    quant_chip = _label_chip(w.quantization_guess.label.value)
+    # The headline GPU count / concurrency come from the "prod" fleet tier, if any.
+    prod_opt = (
+        next((o for o in (f.options if f else []) if o.tier == "prod"), None) if f else None
+    )
+    prod_gpus = str(prod_opt.gpu_count) if prod_opt else "—"
+    prod_concurrent = str(prod_opt.max_concurrent_at_reference_ctx) if prod_opt else "—"
+    headline = (
+        f"<div class='lc-header'>"
+        f"<div class='lc-title'>{_esc(report.model_id)}</div>"
+        f"<div class='lc-subtitle'>"
+        f"{_esc(report.gpu)} · {_esc(report.engine)}"
+        f"</div></div>"
+        f"<div class='lc-stats'>"
+        f"{_stat_card('Weight' if not is_zh else '权重', weight_str, sublabel='from safetensors API' if not is_zh else '取自 safetensors API', chip=weight_chip)}"
+        f"{_stat_card('Quantization' if not is_zh else '量化', _esc(w.quantization_guess.value), sublabel='resolved scheme' if not is_zh else '已识别方案', chip=quant_chip)}"
+        f"{_stat_card('Prod GPUs' if not is_zh else 'Prod GPU 数', prod_gpus, sublabel='for 16-user prod' if not is_zh else '生产档（16 路并发）')}"
+        f"{_stat_card('Users @ 128K' if not is_zh else '用户 @ 128K', prod_concurrent, sublabel='concurrent at prod tier' if not is_zh else '生产档的并发')}"
+        f"</div>"
+    )
+    # Provenance footer for the headline
+    quant_source = _esc(w.quantization_guess.source or "")
+    headline += f"<div class='lc-prov'>{quant_source}</div>"
+    # ---- Architecture --------------------------------------------------------
+    arch_rows: list[tuple[str, str]] = [("model_type", p.model_type)]
+    if p.attention:
+        arch_rows.append(
+            (
+                "attention",
+                f"{p.attention.variant} (heads={p.attention.num_heads}, "
+                f"kv_heads={p.attention.num_kv_heads}, hd={p.attention.head_dim})",
+            )
+        )
+    if p.moe:
+        arch_rows.append(
+            (
+                "moe",
+                f"{p.moe.num_routed_experts} routed + "
+                f"{p.moe.num_shared_experts} shared, top-{p.moe.num_experts_per_tok}",
+            )
+        )
+    if p.sliding_window:
+        arch_rows.append(("sliding_window", str(p.sliding_window)))
+    arch_html = "".join(
+        f"<tr><td><code>{_esc(k)}</code></td><td><code>{_esc(v)}</code></td></tr>"
+        for k, v in arch_rows
+    )
+    arch_explainer = (
+        "从模型 config.json 读出来的，决定后续所有公式怎么走（是否分组注意力、是否 MoE、是否滑动窗口）。"
+        if is_zh
+        else "Read straight from the model's config.json. Drives every formula "
+        "downstream — attention sharding, MoE active-expert ratio, sliding window."
+    )
+    arch_section = (
+        f"<div class='lc-section'><h3>{'架构' if is_zh else 'Architecture'}</h3>"
+        f"<div class='lc-section-help'>{arch_explainer}</div>"
+        f"<table class='lc-table'>{arch_html}</table></div>"
+    )
+    # ---- Reconciliation ------------------------------------------------------
+    recon_rows = []
+    # Only the first five candidates are shown; the row matching r.best is highlighted.
+    for c in r.candidates[:5]:
+        is_best = c.scheme == r.best.value
+        cls = " class='lc-best'" if is_best else ""
+        marker = " ✓" if is_best else ""
+        recon_rows.append(
+            f"<tr{cls}><td><code>{_esc(c.scheme)}</code>{marker}</td>"
+            f"<td>{_fmt_bytes(c.predicted_bytes)}</td>"
+            f"<td>{c.relative_error * 100:.1f}%</td></tr>"
+        )
+    recon_explainer = (
+        "用每种量化方案预测应该有多少字节，跟实际 safetensors 字节对比。误差最小的胜出。"
+        "FP4_FP8_MIXED / GPTQ_INT4 / AWQ_INT4 在 0.55 bpp 处会打平，需要 config 或 dtype 进一步区分。"
+        if is_zh
+        else "Predict bytes under each quantization hypothesis, compare against the real "
+        "safetensors size. Lowest error wins. FP4_FP8_MIXED / GPTQ_INT4 / AWQ_INT4 tie "
+        "at 0.55 bpp — broken via config.json or per-tensor dtype."
+    )
+    recon_section = (
+        f"<div class='lc-section'>"
+        f"<h3>{'量化反演' if is_zh else 'Quantization reconciliation'}</h3>"
+        f"<div class='lc-section-help'>{recon_explainer}</div>"
+        f"<table class='lc-table lc-table-recon'>"
+        f"<thead><tr><th>Scheme</th>"
+        f"<th>{'预测字节' if is_zh else 'Predicted'}</th>"
+        f"<th>{'误差' if is_zh else 'Error'}</th></tr></thead>"
+        f"<tbody>{''.join(recon_rows)}</tbody></table></div>"
+    )
+    # ---- Fleet ---------------------------------------------------------------
+    fleet_section = ""
+    if f and f.options:
+        # Pick which context lengths get their own concurrency column.
+        # Always include 128K if any option has it; also include the model max
+        # if it's larger (e.g. 1M for DeepSeek-V4-Flash) so the user can compare
+        # "fits 23 users at 128K but only 2 at 1M".
+        all_ctxs: set[int] = set()
+        for opt in f.options:
+            for ctx, _ in opt.max_concurrent_by_context:
+                all_ctxs.add(ctx)
+        ctx_cols: list[int] = []
+        if 131_072 in all_ctxs:
+            ctx_cols.append(131_072)
+        max_ctx = max(all_ctxs) if all_ctxs else 0
+        if max_ctx > 131_072 and max_ctx not in ctx_cols:
+            ctx_cols.append(max_ctx)
+        # Fallback: no 128K entry and nothing larger — show the largest context available.
+        if not ctx_cols and all_ctxs:
+            ctx_cols.append(max_ctx)
+        def _ctx_label(ctx: int) -> str:
+            # Column label: decimal millions ("1M" / "1.5M"), binary thousands ("128K"), else the raw number.
+            if ctx >= 1_000_000:
+                return f"{ctx // 1_000_000}M" if ctx % 1_000_000 == 0 else f"{ctx / 1_000_000:.1f}M"
+            if ctx >= 1024:
+                return f"{ctx // 1024}K"
+            return str(ctx)
+        rows = []
+        for opt in f.options:
+            star = " ★" if opt.tier == f.best_tier else ""
+            cls = " class='lc-best'" if opt.tier == f.best_tier else ""
+            # Clamp at zero so an over-committed GPU never shows negative headroom.
+            headroom = max(0, opt.usable_bytes_per_gpu - opt.weight_bytes_per_gpu)
+            ctx_map = dict(opt.max_concurrent_by_context)
+            ctx_cells = "".join(f"<td>{ctx_map.get(c, '—')}</td>" for c in ctx_cols)
+            rows.append(
+                f"<tr{cls}><td><code>{opt.tier}{star}</code></td>"
+                f"<td>{opt.gpu_count}</td>"
+                f"<td>{_fmt_bytes(opt.weight_bytes_per_gpu)}</td>"
+                f"<td>{_fmt_bytes(headroom)}</td>"
+                f"{ctx_cells}</tr>"
+            )
+        ctx_headers = "".join(
+            f"<th>{('@ ' + _ctx_label(c) + ' 并发') if is_zh else ('Concurrent @ ' + _ctx_label(c))}</th>"
+            for c in ctx_cols
+        )
+        fleet_explainer = (
+            "min = 刚好放得下；dev = 8 路并发场景；prod = 16 路并发场景。★ = 推荐。"
+            if is_zh
+            else "min = barely fits weights; dev = sized for 8 concurrent at 128K; "
+            "prod = sized for 16 concurrent at 128K. ★ = recommended."
+        )
+        fleet_section = (
+            f"<div class='lc-section'>"
+            f"<h3>{'推荐集群' if is_zh else 'Recommended fleet'}</h3>"
+            f"<div class='lc-section-help'>{fleet_explainer}</div>"
+            f"<table class='lc-table'>"
+            f"<thead><tr><th>Tier</th><th>GPUs</th>"
+            f"<th>Weight/GPU</th><th>Headroom/GPU</th>"
+            f"{ctx_headers}</tr></thead>"
+            f"<tbody>{''.join(rows)}</tbody></table></div>"
+        )
+    # ---- Performance ---------------------------------------------------------
+    perf_explainer = (
+        "Prefill 用算力公式（FLOPs = 2 × 参数 × 输入 token），decode 用带宽公式（吞吐 = 带宽 × 利用率 / 权重字节）。"
+        "Bottleneck 标 memory_bandwidth 说明 decode 是带宽瓶颈，加显存带宽更高的 GPU 比加算力更划算。"
+        if is_zh
+        else "Prefill uses the compute formula (FLOPs = 2 × params × input_tokens, Kaplan 2020). "
+        "Decode uses memory-bandwidth formula (tps = BW × util / weight_bytes, vLLM paper). "
+        "Bottleneck = memory_bandwidth means a higher-BW GPU helps more than more FLOPS."
+    )
+    perf_section = ""
+    # The performance block needs all three estimates; otherwise it is omitted.
+    if report.prefill and report.decode and report.concurrency:
+        max_users = report.concurrency.max_concurrent.value
+        bn = report.concurrency.bottleneck
+        # Each item is (label, value HTML, sub-label); only label and sub-label are escaped below.
+        items = [
+            (
+                "Prefill latency" if not is_zh else "Prefill 延迟",
+                f"{report.prefill.latency_ms.value:.0f} ms",
+                f"@ {report.perf_input_tokens or 2000} input tokens",
+            ),
+            (
+                "Cluster decode" if not is_zh else "集群 decode 吞吐",
+                f"{report.decode.cluster_tokens_per_sec.value:.0f} tok/s",
+                "",
+            ),
+            (
+                "Max concurrent users" if not is_zh else "最大并发用户",
+                str(max_users),
+                "",
+            ),
+            (
+                "Bottleneck" if not is_zh else "瓶颈",
+                f"<code>{_esc(bn)}</code>",
+                "",
+            ),
+        ]
+        items_html = "".join(
+            f"<div class='lc-perf-item'>"
+            f"<div class='lc-perf-value'>{v}</div>"
+            f"<div class='lc-perf-label'>{_esc(label)}</div>"
+            f"<div class='lc-perf-sub'>{_esc(sub)}</div></div>"
+            for label, v, sub in items
+        )
+        perf_section = (
+            f"<div class='lc-section'>"
+            f"<h3>{'性能' if is_zh else 'Performance'}</h3>"
+            f"<div class='lc-section-help'>{perf_explainer}</div>"
+            f"<div class='lc-perf'>{items_html}</div></div>"
+        )
+    # ---- KV cache per request -----------------------------------------------
+    kv_section = ""
+    if report.kv_cache_by_context:
+        rows = []
+        # One row per context length, in ascending order.
+        for ctx, av in sorted(report.kv_cache_by_context.items()):
+            rows.append(
+                f"<tr><td>{ctx:,}</td><td>{_fmt_bytes(av.value)}</td>"
+                f"<td>{_label_chip(av.label.value)}</td></tr>"
+            )
+        kv_explainer = (
+            "单个请求在不同 context 长度下需要多少 KV 缓存。这是决定一张 GPU 能并发跑多少请求的关键。"
+            "MLA / MQA 模型这里会比标准 GQA 小很多。"
+            if is_zh
+            else "How much KV cache one request consumes at each context length. "
+            "This is what limits per-GPU concurrency. MLA / MQA models are "
+            "dramatically smaller here than standard GQA."
+        )
+        kv_section = (
+            f"<div class='lc-section'>"
+            f"<h3>{'KV 缓存（每请求）' if is_zh else 'KV cache per request'}</h3>"
+            f"<div class='lc-section-help'>{kv_explainer}</div>"
+            f"<table class='lc-table lc-table-recon'>"
+            f"<thead><tr><th>{'Context tokens' if not is_zh else 'Context 长度'}</th>"
+            f"<th>{'KV bytes' if not is_zh else 'KV 字节'}</th>"
+            f"<th>{'Label' if not is_zh else '标签'}</th></tr></thead>"
+            f"<tbody>{''.join(rows)}</tbody></table></div>"
+        )
+    # ---- Engine compatibility -----------------------------------------------
+    engine_section = ""
+    em = report.engine_match
+    if em:
+        def _fmt_flag(f) -> str:  # noqa: ANN001
+            # "<flag> <value>", stripped of surrounding whitespace. The parameter
+            # shadows the outer fleet variable `f` only inside this helper.
+            base = f"{f.flag} {f.value}".strip()
+            return base
+        flags = ", ".join(_fmt_flag(f) for f in em.required_flags) if em.required_flags else "—"
+        opt_flags = ", ".join(_fmt_flag(f) for f in em.optional_flags) if em.optional_flags else "—"
+        caveats = em.caveats_zh if is_zh else em.caveats_en
+        sources_html = "—"
+        if em.sources:
+            sources_html = "<br>".join(
+                f'<a href="{_esc(s.url)}" target="_blank" rel="noopener">{_esc(s.url)}</a>'
+                + (
+                    f" <span class='lc-prov'>({_esc(s.captured_date)})</span>"
+                    if s.captured_date
+                    else ""
+                )
+                for s in em.sources
+            )
+        # (row label, cell HTML) pairs; the cell values are already HTML.
+        rows = [
+            (("引擎" if is_zh else "Engine"), f"<code>{_esc(em.engine)}</code>"),
+            (
+                ("版本要求" if is_zh else "Version"),
+                f"<code>{_esc(em.version_spec)}</code>",
+            ),
+            (
+                ("支持级别" if is_zh else "Support"),
+                # Support values that double as provenance labels get a chip; others are shown as code.
+                _label_chip(em.support) if em.support in {"verified", "cited", "unverified"} else f"<code>{_esc(em.support)}</code>",
+            ),
+            (
+                ("验证级别" if is_zh else "Verification"),
+                _label_chip(em.verification_level),
+            ),
+            (("必需 flag" if is_zh else "Required flags"), f"<code>{_esc(flags)}</code>"),
+            (("可选 flag" if is_zh else "Optional flags"), f"<code>{_esc(opt_flags)}</code>"),
+        ]
+        if caveats:
+            rows.append((("注意事项" if is_zh else "Caveats"), _esc(caveats)))
+        rows.append((("来源" if is_zh else "Sources"), sources_html))
+        body = "".join(f"<tr><td>{k}</td><td>{v}</td></tr>" for k, v in rows)
+        engine_explainer = (
+            "这个模型在 vLLM/SGLang 哪个版本起能跑、需要哪些必需 flag、有哪些优化 flag。"
+            "verification_level 标 cited 表示从 PR / release note 引用，verified 表示实测过。"
+            if is_zh
+            else "Which engine version supports this model, what flags are required, "
+            "and which optional flags help. verification_level=cited means we got it "
+            "from a PR or release note; verified means we actually ran it."
+        )
+        engine_section = (
+            f"<div class='lc-section'>"
+            f"<h3>{'引擎兼容性' if is_zh else 'Engine compatibility'}</h3>"
+            f"<div class='lc-section-help'>{engine_explainer}</div>"
+            f"<table class='lc-table'>{body}</table></div>"
+        )
+    # ---- GPU spec ------------------------------------------------------------
+    gpu_section = ""
+    g = report.gpu_spec
+    if g:
+        notes = g.notes_zh if is_zh else g.notes_en
+        # NOTE(review): only Memory BW falls back to "—" when missing; the other fields are interpolated as-is.
+        rows = [
+            ("HBM", f"{g.memory_gb} GB"),
+            ("Memory BW", f"{g.memory_bandwidth_gbps or '—'} GB/s"),
+            ("NVLink BW", f"{g.nvlink_bandwidth_gbps} GB/s"),
+            ("FP16 TFLOPS", f"{g.fp16_tflops}"),
+            ("FP8", "✓" if g.fp8_support else "—"),
+            ("FP4", "✓" if g.fp4_support else "—"),
+        ]
+        rows_html = "".join(
+            f"<tr><td>{_esc(k)}</td><td><code>{_esc(v)}</code></td></tr>"
+            for k, v in rows
+        )
+        notes_html = (
+            f"<div class='lc-prov' style='margin-top:8px'>{_esc(notes)}</div>" if notes else ""
+        )
+        # A source starting with "http" becomes a link; any other non-empty source is shown as plain text.
+        source_html = (
+            f"<div class='lc-prov'>{'来源' if is_zh else 'Source'}: "
+            f"<a href='{_esc(g.spec_source)}' target='_blank' rel='noopener'>"
+            f"{_esc(g.spec_source)}</a></div>"
+            if g.spec_source and g.spec_source.startswith("http")
+            else (f"<div class='lc-prov'>{_esc(g.spec_source)}</div>" if g.spec_source else "")
+        )
+        gpu_explainer = (
+            "目标 GPU 的硬件规格。Memory BW 决定 decode 能跑多快，FP8/FP4 支持决定能用什么量化。"
+            if is_zh
+            else "Hardware spec of the chosen GPU. Memory BW caps decode throughput; "
+            "FP8/FP4 support determines which quantization paths actually accelerate."
+        )
+        gpu_section = (
+            f"<div class='lc-section'>"
+            f"<h3>{'目标 GPU 规格' if is_zh else 'Target GPU spec'} — <code>{_esc(g.id)}</code></h3>"
+            f"<div class='lc-section-help'>{gpu_explainer}</div>"
+            f"<table class='lc-table'>{rows_html}</table>"
+            f"{notes_html}{source_html}"
+            f"</div>"
+        )
+    # ---- Generated command ---------------------------------------------------
+    cmd_section = ""
+    if report.generated_command:
+        cmd_explainer = (
+            "可以直接复制粘贴到带显卡的机器上跑。flag 是按推荐 tier 的 GPU 数 + 引擎兼容矩阵的必需 flag 自动拼的。"
+            if is_zh
+            else "Copy-pasteable on a machine with the right GPUs. Flags auto-assembled "
+            "from the recommended fleet tier + engine compat matrix's required flags."
+        )
+        cmd_section = (
+            f"<div class='lc-section'>"
+            f"<h3>{'生成命令' if is_zh else 'Generated command'}</h3>"
+            f"<div class='lc-section-help'>{cmd_explainer}</div>"
+            f"<pre class='lc-cmd'><code>{_esc(report.generated_command)}</code></pre></div>"
+        )
+    # Display order differs from build order: the GPU spec comes right after the architecture.
+    return (
+        "<div class='lc-result'>"
+        + headline
+        + arch_section
+        + gpu_section
+        + recon_section
+        + kv_section
+        + fleet_section
+        + perf_section
+        + engine_section
+        + cmd_section
+        + _render_star_cta(is_zh)
+        + "</div>"
+    )
+def _render_compare(reports: list[EvaluationReport], locale: str) -> str:
+    """Side-by-side comparison of N >= 2 reports for the same model on
+    different GPUs.
+    Each metric column declares whether higher or lower is better and we
+    paint the winner cell in green so the eye snaps to it.
+    """
+    set_locale(locale)  # type: ignore[arg-type]
+    is_zh = locale == "zh"
+    # All reports share the same model_id + engine — pull from the first.
+    head = reports[0]
+    title = (
+        f"<div class='lc-header'>"
+        f"<div class='lc-title'>{_esc(head.model_id)}</div>"
+        f"<div class='lc-subtitle'>"
+        f"{('对比 ' + str(len(reports)) + ' 张 GPU') if is_zh else ('Comparing ' + str(len(reports)) + ' GPUs')}"
+        f" · {_esc(head.engine)}"
+        f"</div></div>"
+    )
+    # Metric definitions: (label_en, label_zh, value_fn, better=lower|higher|info, formatter)
+    # "info" rows are not contested — used for model-determined facts (same across
+    # GPUs by construction) or for descriptive cells like Bottleneck.
+    def _max_concurrent(r: EvaluationReport) -> int | None:
+        if not r.fleet:
+            return None
+        prod = next((o for o in r.fleet.options if o.tier == "prod"), None)
+        return prod.max_concurrent_at_reference_ctx if prod else None
+    def _prod_gpu_count(r: EvaluationReport) -> int | None:
+        if not r.fleet:
+            return None
+        prod = next((o for o in r.fleet.options if o.tier == "prod"), None)
+        return prod.gpu_count if prod else None
+    def _kv_per_user_128k(r: EvaluationReport) -> int | None:
+        av = r.kv_cache_by_context.get(131072)
+        return av.value if av is not None else None
+    def _native_precision_score(r: EvaluationReport) -> int | None:
+        g = r.gpu_spec
+        if g is None:
+            return None
+        return (1 if g.fp8_support else 0) + (1 if g.fp4_support else 0)
+    def _fmt_native(v: int | None) -> str:
+        if v is None:
+            return "—"
+        return {0: "FP16 only", 1: "FP8", 2: "FP8 + FP4"}.get(v, str(v))
+    def _max_context_tokens(r: EvaluationReport) -> int | None:
+        """Effective max context the model claims to support.
+        In modern HF configs (LLaMA 3+, DeepSeek V3+, Qwen2.5+), the field
+        max_position_embeddings already reflects the post-RoPE/YaRN-scaling
+        window. rope_scaling_factor is recorded for provenance but must NOT
+        be multiplied in again — that double-counts.
+        """
+        pos = r.profile.position
+        if pos is None or pos.max_position_embeddings is None:
+            return None
+        return int(pos.max_position_embeddings)
+    def _fmt_context(v: int | None) -> str:
+        """Binary-base formatting so 131072 reads as '128K' not '131K'."""
+        if v is None:
+            return "—"
+        if v >= 1024 * 1024:
+            return f"{v / (1024 * 1024):.1f}M".replace(".0M", "M")
+        if v >= 1024:
+            return f"{v // 1024}K"
+        return str(v)
+    def _cluster_qps(r: EvaluationReport) -> float | None:
+        """Steady-state queries/sec the cluster sustains:
+        QPS = cluster_decode_tokens_per_sec / output_tokens_per_request."""
+        if not r.decode or r.decode.cluster_tokens_per_sec.value <= 0:
+            return None
+        out = r.perf_output_tokens or 512
+        if out <= 0:
+            return None
+        return r.decode.cluster_tokens_per_sec.value / out
+    metrics = [
+        # ── Model-determined rows (info; identical across GPUs by definition) ──
+        ("Quantization", "量化方案",
+         lambda r: r.weight.quantization_guess.value, "info",
+         lambda v: _esc(str(v)) if v else "—"),
+        ("Weights total", "权重总量",
+         lambda r: r.weight.total_bytes.value, "info",
+         lambda v: _fmt_bytes(v) if v else "—"),
+        ("KV / user @ 128K", "KV / 用户 @ 128K",
+         _kv_per_user_128k, "info",
+         lambda v: _fmt_bytes(v) if v is not None else "—"),
+        ("Max context", "最大上下文",
+         _max_context_tokens, "info",
+         _fmt_context),
+        # ── GPU hardware specs (contested) ──
+        ("HBM / card", "单卡显存",
+         lambda r: r.gpu_spec.memory_gb if r.gpu_spec else None, "higher",
+         lambda v: f"{v} GB" if v is not None else "—"),
+        ("HBM bandwidth", "显存带宽",
+         lambda r: r.gpu_spec.memory_bandwidth_gbps if r.gpu_spec else None, "higher",
+         lambda v: f"{v:,} GB/s" if v is not None else "—"),
+        ("NVLink / card", "NVLink 带宽",
+         lambda r: r.gpu_spec.nvlink_bandwidth_gbps if r.gpu_spec else None, "higher",
+         lambda v: (f"{v} GB/s" if v else "无") if v is not None else "—"),
+        ("Native FP8/FP4", "原生低精度",
+         _native_precision_score, "higher",
+         _fmt_native),
+        # ── Sizing & performance outcomes (contested) ──
+        ("Prod GPUs", "生产档 GPU 数",
+         _prod_gpu_count, "lower",
+         lambda v: str(v) if v is not None else "—"),
+        ("Users @ 128K", "用户 @ 128K",
+         _max_concurrent, "higher",
+         lambda v: str(v) if v is not None else "—"),
+        ("Prefill latency", "Prefill 延迟",
+         lambda r: r.prefill.latency_ms.value if r.prefill else None, "lower",
+         lambda v: f"{v:.0f} ms" if v is not None else "—"),
+        ("Per-GPU decode", "单卡 decode 吞吐",
+         lambda r: r.decode.per_gpu_tokens_per_sec.value if r.decode else None, "higher",
+         lambda v: f"{v:.0f} tok/s" if v is not None else "—"),
+        ("Cluster decode", "集群 decode 吞吐",
+         lambda r: r.decode.cluster_tokens_per_sec.value if r.decode else None, "higher",
+         lambda v: f"{v:.0f} tok/s" if v is not None else "—"),
+        ("Sustained QPS", "稳态 QPS",
+         _cluster_qps, "higher",
+         lambda v: f"{v:.2f} q/s" if v is not None else "—"),
+        # ── Diagnostic (info — string, not a number race) ──
+        ("Bottleneck", "瓶颈",
+         lambda r: r.concurrency.bottleneck if r.concurrency else None, "info",
+         lambda v: f"<code>{_esc(str(v))}</code>" if v else "—"),
+    ]
+    # GPU column headers
+    gpu_headers = "".join(
+        f"<th class='lc-cmp-gpu'>{_esc(r.gpu)}</th>" for r in reports
+    )
+    rows_html = []
+    for label_en, label_zh, getter, better, fmt in metrics:
+        values = [getter(r) for r in reports]
+        # Pick the winning index. None values are excluded from the contest.
+        winner_idx: int | None = None
+        if better in ("higher", "lower"):
+            numeric_pairs = [(i, v) for i, v in enumerate(values) if isinstance(v, (int, float))]
+            if numeric_pairs:
+                if better == "higher":
+                    winner_idx = max(numeric_pairs, key=lambda p: p[1])[0]
+                else:
+                    winner_idx = min(numeric_pairs, key=lambda p: p[1])[0]
+                # If all values are equal, no winner (avoid arbitrary-tiebreak gold star)
+                vals_set = {v for _, v in numeric_pairs}
+                if len(vals_set) <= 1:
+                    winner_idx = None
+        cells = []
+        for i, v in enumerate(values):
+            cls = " class='lc-cmp-winner'" if i == winner_idx else ""
+            cells.append(f"<td{cls}>{fmt(v)}</td>")
+        label = label_zh if is_zh else label_en
+        # Tag info rows so the eye knows "this is a model fact, not a contest".
+        is_info = better == "info"
+        label_cls = "lc-cmp-row-label lc-cmp-row-info" if is_info else "lc-cmp-row-label"
+        tr_cls = " class='lc-cmp-tr-info'" if is_info else ""
+        rows_html.append(
+            f"<tr{tr_cls}><th class='{label_cls}'>{_esc(label)}</th>{''.join(cells)}</tr>"
+        )
+    # Aggregate winner — count column wins across "higher/lower" metrics
+    win_counts = [0] * len(reports)
+    for label_en, label_zh, getter, better, fmt in metrics:
+        if better == "info":
+            continue
+        values = [getter(r) for r in reports]
+        numeric_pairs = [(i, v) for i, v in enumerate(values) if isinstance(v, (int, float))]
+        if not numeric_pairs:
+            continue
+        vals_set = {v for _, v in numeric_pairs}
+        if len(vals_set) <= 1:
+            continue
+        if better == "higher":
+            winner_idx = max(numeric_pairs, key=lambda p: p[1])[0]
+        else:
+            winner_idx = min(numeric_pairs, key=lambda p: p[1])[0]
+        win_counts[winner_idx] += 1
+    overall_text = ""
+    if any(win_counts):
+        max_wins = max(win_counts)
+        leaders = [reports[i].gpu for i, c in enumerate(win_counts) if c == max_wins]
+        if len(leaders) == 1:
+            overall_text = (
+                f"<div class='lc-cmp-summary'>"
+                f"{'综合最优' if is_zh else 'Overall winner'}: "
+                f"<strong>{_esc(leaders[0])}</strong> "
+                f"({max_wins}/{sum(1 for m in metrics if m[3] != 'info')} "
+                f"{'指标领先' if is_zh else 'metrics lead'})"
+                f"</div>"
+            )
+        else:
+            overall_text = (
+                f"<div class='lc-cmp-summary'>"
+                f"{'势均力敌' if is_zh else 'Tied'}: "
+                f"<strong>{_esc(' / '.join(leaders))}</strong>"
+                f"</div>"
+            )
+    table = (
+        f"<div class='lc-section'>"
+        f"<h3>{'对比' if is_zh else 'Side-by-side comparison'}</h3>"
+        f"<div class='lc-cmp-wrap'>"
+        f"<table class='lc-cmp-table'>"
+        f"<thead><tr>"
+        f"<th class='lc-cmp-row-label'></th>"
+        f"{gpu_headers}"
+        f"</tr></thead>"
+        f"<tbody>{''.join(rows_html)}</tbody>"
+        f"</table></div>"
+        f"{overall_text}"
+        f"</div>"
+    )
+    # Per-GPU detail headlines (small stat cards) below the table
+    detail_blocks = []
+    for r in reports:
+        weight_str = _fmt_bytes(r.weight.total_bytes.value)
+        prod = _prod_gpu_count(r)
+        users = _max_concurrent(r)
+        detail_blocks.append(
+            f"<div class='lc-cmp-detail'>"
+            f"<div class='lc-cmp-detail-gpu'>{_esc(r.gpu)}</div>"
+            f"<div class='lc-cmp-detail-row'>"
+            f"<span>{'权重' if is_zh else 'Weight'}</span><strong>{weight_str}</strong></div>"
+            f"<div class='lc-cmp-detail-row'>"
+            f"<span>{'生产 GPU' if is_zh else 'Prod GPUs'}</span>"
+            f"<strong>{prod if prod is not None else '—'}</strong></div>"
+            f"<div class='lc-cmp-detail-row'>"
+            f"<span>{'用户 @ 128K' if is_zh else 'Users @ 128K'}</span>"
+            f"<strong>{users if users is not None else '—'}</strong></div>"
+            f"</div>"
+        )
+    detail_section = (
+        f"<div class='lc-section'>"
+        f"<h3>{'各档详情' if is_zh else 'Per-GPU detail'}</h3>"
+        f"<div class='lc-cmp-details'>{''.join(detail_blocks)}</div>"
+        f"</div>"
+    )
+    return (
+        "<div class='lc-result'>"
+        + title
+        + table
+        + detail_section
+        + _render_star_cta(is_zh)
+        + "</div>"
+    )
+def _render_star_cta(is_zh: bool) -> str:
+    """Tail-of-result CTA — shown right after the user got their answer,
+    which is when satisfaction is highest and the GitHub star ask reads as
+    'thanks for the tool' rather than 'please give me attention'."""
+    en_msg = "Saved you GPU-sizing math?"
+    zh_msg = "省了你 GPU 选型的时间？"
+    cta_en = "Star on GitHub"
+    cta_zh = "给个 Star"
+    text_top = zh_msg if is_zh else en_msg
+    text_bottom = en_msg if is_zh else zh_msg
+    cta = f"{cta_zh if is_zh else cta_en} · {cta_en if is_zh else cta_zh}"
+    return (
+        "<a class='lc-star-cta' href='https://github.com/FlyTOmeLight/llm-cal' "
+        "target='_blank' rel='noopener'>"
+        "<svg viewBox='0 0 16 16' width='18' height='18' aria-hidden='true' fill='currentColor'>"
+        "<path d='M8 0C3.58 0 0 3.58 0 8a8 8 0 0 0 5.47 7.59c.4.07.55-.17.55-.38v-1.33c-2.22.48-2.69-1.07-2.69-1.07-.36-.92-.89-1.17-.89-1.17-.73-.5.06-.49.06-.49.81.06 1.23.83 1.23.83.72 1.23 1.88.87 2.34.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.83-2.15-.08-.2-.36-1.02.08-2.13 0 0 .67-.21 2.2.82a7.6 7.6 0 0 1 4 0c1.53-1.04 2.2-.82 2.2-.82.44 1.11.16 1.93.08 2.13.51.56.83 1.27.83 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48v2.19c0 .21.15.46.55.38A8 8 0 0 0 16 8c0-4.42-3.58-8-8-8z'/>"
+        "</svg>"
+        f"<div class='lc-star-cta-text'>"
+        f"<div class='lc-star-cta-q'>{text_top}</div>"
+        f"<div class='lc-star-cta-q-en'>{text_bottom}</div>"
+        f"</div>"
+        f"<div class='lc-star-cta-action'>{cta} →</div>"
+        "</a>"
+    )
+def _render_explain(entries: list[ExplainEntry], is_zh: bool) -> str:
+    """Render --explain derivation trace as an HTML accordion."""
+    if not entries:
+        return ""
+    blocks = []
+    for e in entries:
+        inputs_html = ""
+        if e.inputs:
+            inputs_html = "<ul class='lc-explain-inputs'>" + "".join(
+                f"<li><code>{_esc(inp.name)}</code> = "
+                f"<strong>{_esc(inp.value)}</strong> "
+                f"<span class='lc-explain-label'>{_esc(inp.label)}</span>"
+                + (f" — <em>{_esc(inp.note)}</em>" if inp.note else "")
+                + "</li>"
+                for inp in e.inputs
+            ) + "</ul>"
+        steps_html = ""
+        if e.steps:
+            steps_html = "<ol class='lc-explain-steps'>" + "".join(
+                f"<li>{_esc(s)}</li>" for s in e.steps
+            ) + "</ol>"
+        source_html = (
+            f"<div class='lc-prov'>{'来源' if is_zh else 'Source'}: {_esc(e.source)}</div>"
+            if e.source
+            else ""
+        )
+        blocks.append(
+            f"<div class='lc-explain-entry'>"
+            f"<div class='lc-explain-heading'>{_esc(e.heading)}</div>"
+            f"<div class='lc-explain-formula'><code>{_esc(e.formula)}</code></div>"
+            f"{inputs_html}{steps_html}"
+            f"<div class='lc-explain-result'>"
+            f"{'结果' if is_zh else 'Result'}: <strong>{_esc(e.result)}</strong></div>"
+            f"{source_html}"
+            f"</div>"
+        )
+    return (
+        "<div class='lc-result'>"
+        f"<div class='lc-section'>"
+        f"<h3>{'推导链 (--explain)' if is_zh else 'Derivation trace (--explain)'}</h3>"
+        + "".join(blocks)
+        + "</div></div>"
+    )
+def _render_llm_review(content: str | None, error: str | None, model: str, is_zh: bool) -> str:
+    """Render the optional LLM second-opinion section.
+    A non-empty error is shown through _render_error with an "LLM review: "
+    prefix, and missing content yields an empty string. Otherwise the reply
+    is escaped and its newlines become <br> tags."""
+    if error:
+        return _render_error(f"LLM review: {error}", is_zh)
+    if not content:
+        return ""
+    # The reply is markdown, but gr.HTML does not render markdown, so it is
+    # shown as escaped text with line breaks; "## ..." headers still read fine.
+    body = _esc(content).replace("\n", "<br>")
+    if is_zh:
+        title = "LLM 审计 (--llm-review)"
+        disclaimer = "仅供参考，不覆盖前 6 个 label"
+    else:
+        title = "LLM review (--llm-review)"
+        disclaimer = "Second opinion — never overrides the 6 primary labels"
+    model_tag = f"<span class='lc-llm-model'>{_esc(model)}</span>"
+    banner = f"<div class='lc-llm-banner'>{_label_chip('llm-opinion')} {disclaimer}</div>"
+    return (
+        "<div class='lc-result'><div class='lc-section'>"
+        f"<h3>{title} {model_tag}</h3>"
+        f"{banner}"
+        f"<div class='lc-llm-content'>{body}</div>"
+        "</div></div>"
+    )
+def _render_error(msg: str, is_zh: bool) -> str:
+    label = "出错了" if is_zh else "Error"
+    return (
+        f"<div class='lc-result lc-error'>"
+        f"<h3>{label}</h3>"
+        f"<pre>{_esc(msg)}</pre></div>"
+    )
+def _render_loading(is_zh: bool) -> str:
+    msg = (
+        "正在拉取模型元数据 + 读 safetensors header… 首次大模型约 3-8 秒"
+        if is_zh
+        else "Fetching model metadata + reading safetensors header… "
+        "first lookup of a large model takes 3-8 seconds"
+    )
+    return (
+        "<div class='lc-result lc-loading'>"
+        "<div class='lc-spinner'></div>"
+        f"<div class='lc-loading-text'>{msg}</div>"
+        "</div>"
+    )
+# ---------------------------------------------------------------------------
+# Backend handler
+# Module-level cache of Evaluator instances keyed by source name
+# ("modelscope" / anything else -> HuggingFace); filled lazily by _get_evaluator.
+_evaluators: dict[str, Evaluator] = {}
+def _get_evaluator(source_key: str) -> Evaluator:
+    """One evaluator per source — Evaluator caches an HfApi client internally
+    so we don't want to rebuild it every keystroke."""
+    if source_key not in _evaluators:
+        if source_key == "modelscope":
+            _evaluators[source_key] = Evaluator(source=ModelScopeSource())
+        else:
+            _evaluators[source_key] = Evaluator(source=HuggingFaceSource())
+    return _evaluators[source_key]
+def calculate(
+    model_id: str,
+    gpu,  # list[str] from multiselect; str also tolerated  # noqa: ANN001
+    engine: str,
+    context_length: int | None,
+    lang: str,
+    source: str,
+    gpu_count: int | None,
+    input_tokens: int,
+    output_tokens: int,
+    target_tps: float,
+    prefill_util: float,
+    decode_bw_util: float,
+    concurrency_degradation: float,
+    refresh: bool,
+    explain: bool,
+    llm_review: bool,
+    hf_token: str,
+    ms_token: str,
+    llm_api_key: str,
+    llm_base_url: str,
+    llm_model: str,
+) -> tuple[str, str, str]:
+    """Returns (main_html, explain_html, llm_review_html)."""
+    locale = "zh" if lang.startswith("中") else "en"
+    is_zh = locale == "zh"
+    # Normalize GPU input. Multiselect returns list; defensive coerce for safety.
+    if isinstance(gpu, str):
+        gpu_list = [gpu] if gpu else []
+    elif isinstance(gpu, (list, tuple)):
+        gpu_list = [g for g in gpu if g]
+    else:
+        gpu_list = []
+    if not model_id or not model_id.strip():
+        return (
+            _render_error(
+                "请输入模型 ID" if is_zh else "Enter a model id",
+                is_zh,
+            ),
+            "",
+            "",
+        )
+    if not gpu_list:
+        return (_render_error("请选择 GPU" if is_zh else "Pick a GPU", is_zh), "", "")
+    is_compare = len(gpu_list) >= 2
+    # Resolve source key. The radio shows e.g. "HuggingFace" / "ModelScope".
+    src_key = "modelscope" if "modelscope" in source.lower() else "huggingface"
+    # Inject user-provided tokens into env for the duration of this call only.
+    # We restore the prior values in the finally block so a token entered for
+    # one model doesn't leak into the next request from a different user.
+    token_env_keys = (
+        "HF_TOKEN",
+        "HUGGING_FACE_HUB_TOKEN",
+        "MODELSCOPE_API_TOKEN",
+        "MODELSCOPE_TOKEN",
+    )
+    old_token_env = {k: os.environ.get(k) for k in token_env_keys}
+    if hf_token and hf_token.strip():
+        os.environ["HF_TOKEN"] = hf_token.strip()
+    if ms_token and ms_token.strip():
+        os.environ["MODELSCOPE_API_TOKEN"] = ms_token.strip()
+    def _eval_one(g: str) -> EvaluationReport:
+        return _get_evaluator(src_key).evaluate(
+            model_id=model_id.strip(),
+            gpu=g,
+            engine=engine,
+            gpu_count=gpu_count if gpu_count and gpu_count > 0 else None,
+            context_length=context_length if context_length and context_length > 0 else None,
+            refresh=refresh,
+            input_tokens=int(input_tokens) if input_tokens else 2000,
+            output_tokens=int(output_tokens) if output_tokens else 512,
+            target_tokens_per_sec=float(target_tps) if target_tps else 30.0,
+            prefill_utilization=float(prefill_util) if prefill_util else 0.40,
+            decode_bw_utilization=float(decode_bw_util) if decode_bw_util else 0.50,
+            concurrency_degradation=(
+                float(concurrency_degradation) if concurrency_degradation else 1.0
+            ),
+        )
+    try:
+        # ---- Compare path: 2-4 GPUs --------------------------------------
+        if is_compare:
+            try:
+                reports = [_eval_one(g) for g in gpu_list]
+            except Exception as e:  # noqa: BLE001
+                return (_render_error(f"{type(e).__name__}: {e}", is_zh), "", "")
+            return _render_compare(reports, locale), "", ""
+        # ---- Single-GPU path (existing flow) ------------------------------
+        try:
+            report = _eval_one(gpu_list[0])
+        except Exception as e:  # noqa: BLE001
+            return (_render_error(f"{type(e).__name__}: {e}", is_zh), "", "")
+        main_html = _render(report, locale)
+        explain_html = ""
+        llm_html = ""
+        if explain or llm_review:
+            entries = build_explain(report)
+            if explain:
+                explain_html = _render_explain(entries, is_zh)
+            if llm_review:
+                # Only set env vars if user actually provided them — never persist
+                # them in env beyond this call's scope (they live in process env
+                # for the duration of the call, but we don't persist to disk).
+                old_env = {
+                    "LLM_CAL_REVIEWER_API_KEY": os.environ.get("LLM_CAL_REVIEWER_API_KEY"),
+                    "LLM_CAL_REVIEWER_BASE_URL": os.environ.get("LLM_CAL_REVIEWER_BASE_URL"),
+                    "LLM_CAL_REVIEWER_MODEL": os.environ.get("LLM_CAL_REVIEWER_MODEL"),
+                }
+                try:
+                    if llm_api_key.strip():
+                        os.environ["LLM_CAL_REVIEWER_API_KEY"] = llm_api_key.strip()
+                    if llm_base_url.strip():
+                        os.environ["LLM_CAL_REVIEWER_BASE_URL"] = llm_base_url.strip()
+                    if llm_model.strip():
+                        os.environ["LLM_CAL_REVIEWER_MODEL"] = llm_model.strip()
+                    result = run_review(entries, locale=locale)  # type: ignore[arg-type]
+                finally:
+                    for k, v in old_env.items():
+                        if v is None:
+                            os.environ.pop(k, None)
+                        else:
+                            os.environ[k] = v
+                llm_html = _render_llm_review(result.content, result.error, result.model, is_zh)
+        return main_html, explain_html, llm_html
+    finally:
+        for k, v in old_token_env.items():
+            if v is None:
+                os.environ.pop(k, None)
+            else:
+                os.environ[k] = v
+def show_loading(lang: str) -> tuple[str, str, str]:
+    is_zh = lang.startswith("中")
+    return _render_loading(is_zh), "", ""
+# ---------------------------------------------------------------------------
+# UI
+# Gradio theme object: the built-in Soft theme with an indigo primary hue.
+THEME = gr.themes.Soft(primary_hue="indigo")
+# Static hero banner markup: title and bilingual tagline, a GitHub link with a
+# shields.io star badge, and a pitch strip contrasting gpu_poor's figure with
+# llm-cal's. The numbers in the pitch strip are literal text in this string.
+HERO_HTML = """
+<div class='lc-hero'>
+  <div class='lc-hero-top'>
+    <div class='lc-hero-titleblock'>
+      <div class='lc-hero-title'>llm-cal</div>
+      <div class='lc-hero-tagline'>
+        LLM inference hardware calculator · 大模型推理硬件计算器<br>
+        Architecture-aware · Engine-aware · <strong>Honest-labeled</strong>
+      </div>
+    </div>
+    <a class='lc-hero-gh' href='https://github.com/FlyTOmeLight/llm-cal' target='_blank' rel='noopener'>
+      <svg viewBox='0 0 16 16' width='16' height='16' aria-hidden='true' fill='currentColor'>
+        <path d='M8 0C3.58 0 0 3.58 0 8a8 8 0 0 0 5.47 7.59c.4.07.55-.17.55-.38v-1.33c-2.22.48-2.69-1.07-2.69-1.07-.36-.92-.89-1.17-.89-1.17-.73-.5.06-.49.06-.49.81.06 1.23.83 1.23.83.72 1.23 1.88.87 2.34.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.83-2.15-.08-.2-.36-1.02.08-2.13 0 0 .67-.21 2.2.82a7.6 7.6 0 0 1 4 0c1.53-1.04 2.2-.82 2.2-.82.44 1.11.16 1.93.08 2.13.51.56.83 1.27.83 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48v2.19c0 .21.15.46.55.38A8 8 0 0 0 16 8c0-4.42-3.58-8-8-8z'/>
+      </svg>
+      <span class='lc-hero-gh-text'>GitHub</span>
+      <img class='lc-hero-gh-stars' alt='stars'
+        src='https://img.shields.io/github/stars/FlyTOmeLight/llm-cal?style=flat-square&logo=&label=&color=eef2ff&labelColor=eef2ff'
+        loading='lazy' />
+    </a>
+  </div>
+  <div class='lc-hero-pitch'>
+    <div class='lc-pitch-card lc-pitch-bad'>
+      <div class='lc-pitch-tool'>gpu_poor</div>
+      <div class='lc-pitch-num-bad'>284 GB</div>
+      <div class='lc-pitch-method'>assumes pure FP8 · 假设纯 FP8</div>
+    </div>
+    <div class='lc-pitch-arrow'>→</div>
+    <div class='lc-pitch-card lc-pitch-good'>
+      <div class='lc-pitch-tool'>llm-cal</div>
+      <div class='lc-pitch-num-good'>160 GB</div>
+      <div class='lc-pitch-method'>reads real safetensors bytes · 读真实字节</div>
+    </div>
+    <div class='lc-pitch-summary'>
+      <div class='lc-pitch-model'>DeepSeek-V4-Flash · H800</div>
+      <div class='lc-pitch-result'>0.2% error vs 45% · 误差 0.2% vs 45%</div>
+    </div>
+  </div>
+</div>
+"""
+CUSTOM_CSS = """
+/* Font stack — system fonts in both English + Chinese, no Gradio default serif */
+* {
+  font-family: -apple-system, BlinkMacSystemFont, "Inter", "Helvetica Neue",
+    "PingFang SC", "Microsoft YaHei", "Segoe UI", Roboto, Arial, sans-serif !important;
+}
+/* Hide Gradio's default footer chrome that looks like part of our app */
+footer { display: none !important; }
+.show-api, .built-with, .settings { display: none !important; }
+/* Tighter overall padding + center on wide screens — without margin:auto the
+   container left-aligns and leaves ~800px empty on 1920+ displays.
+   width:100% makes it shrink to viewport when narrower than max-width
+   (otherwise on mobile align-items:stretch + max-width overflows). */
+.gradio-container {
+  max-width: 1100px !important;
+  width: 100% !important;
+  margin-left: auto !important;
+  margin-right: auto !important;
+}
+/* Hero section */
+.lc-hero {
+  margin: 8px 0 24px 0;
+  padding: 24px 0 18px 0;
+  border-bottom: 1px solid #e5e7eb;
+}
+.dark .lc-hero { border-bottom-color: #374151; }
+/* Top row: title block (left) + GitHub link (right). On mobile the GH link
+   wraps to its own line above or below the title — order kept so it stays
+   visible above the fold. */
+.lc-hero-top {
+  display: flex;
+  align-items: flex-start;
+  justify-content: space-between;
+  gap: 16px;
+  flex-wrap: wrap;
+  margin-bottom: 14px;
+}
+.lc-hero-titleblock {
+  flex: 1 1 320px;
+  min-width: 0;
+}
+.lc-hero-gh {
+  display: inline-flex;
+  align-items: center;
+  gap: 8px;
+  padding: 6px 12px;
+  border: 1px solid #c7d2fe;
+  background: #eef2ff;
+  border-radius: 999px;
+  font-size: 13px !important;
+  font-weight: 600 !important;
+  color: #4338ca !important;
+  text-decoration: none !important;
+  white-space: nowrap;
+  transition: background 0.15s ease, border-color 0.15s ease;
+  flex: 0 0 auto;
+}
+.lc-hero-gh:hover {
+  background: #e0e7ff;
+  border-color: #a5b4fc;
+}
+.dark .lc-hero-gh {
+  background: #1e1b4b;
+  border-color: #3730a3;
+  color: #c7d2fe !important;
+}
+.dark .lc-hero-gh:hover { background: #312e81; border-color: #4338ca; }
+.lc-hero-gh svg { display: block; }
+.lc-hero-gh-stars {
+  height: 18px;
+  vertical-align: middle;
+  border-radius: 4px;
+}
+.lc-hero-title {
+  font-size: 32px !important;
+  font-weight: 800 !important;
+  letter-spacing: -0.02em;
+  color: #0f172a !important;
+  margin: 0 !important;
+  line-height: 1.15;
+}
+.dark .lc-hero-title { color: #f8fafc !important; }
+.lc-hero-tagline {
+  font-size: 16px !important;
+  color: #6b7280 !important;
+  margin: 6px 0 16px 0;
+  line-height: 1.5;
+}
+.lc-hero-pitch {
+  display: grid;
+  /* 4 cells: bad-card / arrow / good-card / summary on wide screens */
+  grid-template-columns: 1fr 30px 1fr 1.2fr;
+  gap: 14px;
+  align-items: stretch;
+  padding: 0;
+  font-size: 13px !important;
+  color: #1e293b !important;
+}
+.dark .lc-hero-pitch { color: #f1f5f9 !important; }
+/* Tablet: bad / arrow / good in row 1, summary full-width row 2 */
+@media (max-width: 900px) {
+  .lc-hero-pitch {
+    grid-template-columns: 1fr 28px 1fr;
+    grid-template-rows: auto auto;
+  }
+  .lc-pitch-summary { grid-column: 1 / -1; }
+}
+/* Mobile: stack everything, hide the arrow */
+@media (max-width: 540px) {
+  .lc-hero-pitch {
+    grid-template-columns: 1fr;
+    grid-template-rows: repeat(3, auto);
+  }
+  .lc-pitch-arrow { display: none; }
+  .lc-pitch-summary { grid-column: auto; }
+}
+.lc-pitch-card {
+  padding: 14px 18px;
+  border-radius: 10px;
+  border: 1px solid #e5e7eb;
+  background: #ffffff;
+  display: flex;
+  flex-direction: column;
+  justify-content: center;
+  min-width: 0;
+}
+.dark .lc-pitch-card { background: #111827; border-color: #374151; }
+/* Subtle accent bar on the left, not a screaming red/green border */
+.lc-pitch-bad  { border-left: 3px solid #cbd5e1; }
+.lc-pitch-good { border-left: 3px solid #4f46e5; }
+.dark .lc-pitch-bad  { border-left-color: #475569; }
+.dark .lc-pitch-good { border-left-color: #818cf8; }
+.lc-pitch-tool {
+  font-size: 12px !important;
+  font-weight: 600 !important;
+  color: #6b7280 !important;
+  font-family: "SF Mono", "JetBrains Mono", Menlo, monospace !important;
+  margin-bottom: 4px;
+}
+.lc-pitch-num-bad  { font-size: 24px !important; font-weight: 800 !important; color: #b91c1c !important; line-height: 1.1; letter-spacing: -0.01em; }
+.lc-pitch-num-good { font-size: 24px !important; font-weight: 800 !important; color: #15803d !important; line-height: 1.1; letter-spacing: -0.01em; }
+.dark .lc-pitch-num-bad  { color: #f87171 !important; }
+.dark .lc-pitch-num-good { color: #4ade80 !important; }
+.lc-pitch-method {
+  font-size: 11px !important;
+  color: #6b7280 !important;
+  margin-top: 6px;
+  line-height: 1.4;
+}
+.lc-pitch-arrow {
+  display: flex;
+  align-items: center;
+  font-size: 22px !important;
+  color: #9ca3af !important;
+  font-weight: 300;
+}
+.lc-pitch-summary {
+  flex: 1 1 200px;
+  padding: 14px 18px;
+  border-radius: 10px;
+  background: #eef2ff;
+  border: 1px solid #c7d2fe;
+  display: flex;
+  flex-direction: column;
+  justify-content: center;
+}
+.dark .lc-pitch-summary { background: #1e1b4b; border-color: #3730a3; }
+.lc-pitch-model {
+  font-size: 11px !important;
+  font-weight: 600 !important;
+  text-transform: uppercase;
+  letter-spacing: 0.06em;
+  color: #6366f1 !important;
+  margin-bottom: 4px;
+}
+.dark .lc-pitch-model { color: #a5b4fc !important; }
+.lc-pitch-result {
+  font-size: 14px !important;
+  font-weight: 700 !important;
+  color: #312e81 !important;
+}
+.dark .lc-pitch-result { color: #e0e7ff !important; }
+/* Primary button — match the indigo theme; constrain width so it's not a billboard */
+button.primary,
+button[variant="primary"],
+.primary > button {
+  background: #4f46e5 !important;
+  border-color: #4f46e5 !important;
+  color: #ffffff !important;
+  font-weight: 600 !important;
+  letter-spacing: 0.01em;
+  border-radius: 8px !important;
+  padding: 10px 28px !important;
+}
+button.primary:hover,
+button[variant="primary"]:hover,
+.primary > button:hover { background: #4338ca !important; border-color: #4338ca !important; }
+/* The wrapper around the Calculate button — center it, give it sane width */
+.lc-submit-wrap {
+  display: flex !important;
+  justify-content: center !important;
+  margin: 20px 0 8px 0 !important;
+}
+.lc-submit-wrap button {
+  min-width: 220px !important;
+  max-width: 320px !important;
+  width: auto !important;
+}
+/* Form labels — kill Gradio's purple chip; make labels plain uppercase small text */
+[data-testid="block-info"] {
+  background: transparent !important;
+  border: none !important;
+  padding: 0 !important;
+  margin: 0 0 6px 0 !important;
+  font-size: 11px !important;
+  font-weight: 600 !important;
+  text-transform: uppercase !important;
+  letter-spacing: 0.05em !important;
+  color: #6b7280 !important;
+  border-radius: 0 !important;
+  display: block !important;
+}
+.dark [data-testid="block-info"] { color: #9ca3af !important; }
+/* Tooltip / info-text — single line, secondary color, no italic */
+.info-text {
+  font-size: 11px !important;
+  color: #94a3b8 !important;
+  margin: 0 0 4px 0 !important;
+  line-height: 1.4 !important;
+  padding: 0 !important;
+  font-style: normal !important;
+  white-space: normal !important;
+}
+.info-text br { display: none !important; }
+.dark .info-text { color: #64748b !important; }
+/* Kill Gradio's grey form-panel chrome entirely — labels + inputs float on the page */
+.block,
+.block.padded,
+.block.gradio-container,
+.form,
+.row,
+[data-testid="block"] {
+  background: transparent !important;
+  border: none !important;
+  box-shadow: none !important;
+}
+.block.padded { padding: 6px 0 !important; }
+.form { padding: 0 !important; }
+.row { padding: 0 !important; }
+/* Tighten row gap so inputs cluster more naturally */
+.form, .row { gap: 16px !important; }
+/* Tablet (≤900px): Gradio's gr.Row() flex-direction: row keeps 3 inputs
+   in one line. min-width: 320px forces 3-column rows to wrap to 2x1 +
+   1x1 at this size while leaving 2-column rows at 2-up. */
+@media (max-width: 900px) {
+  .form,
+  .row {
+    flex-wrap: wrap !important;
+  }
+  .form > .block,
+  .row > .block {
+    flex: 1 1 calc(50% - 12px) !important;
+    min-width: 320px !important;
+    max-width: 100% !important;
+  }
+}
+/* Mobile (≤540px): single-column form. */
+@media (max-width: 540px) {
+  .form,
+  .row {
+    flex-direction: column !important;
+  }
+  .form > .block,
+  .row > .block {
+    flex: 1 1 100% !important;
+    min-width: 0 !important;
+    width: 100% !important;
+  }
+  .gradio-container { padding: 12px !important; }
+  .lc-hero-title { font-size: 26px !important; }
+  .lc-pitch-num-bad, .lc-pitch-num-good { font-size: 22px !important; }
+  .lc-pitch-arrow { display: none !important; }
+}
+/* Inputs themselves — light border, soft fill */
+input[type="text"],
+input[type="number"],
+input[type="password"],
+textarea,
+select {
+  border: 1px solid #e5e7eb !important;
+  border-radius: 8px !important;
+  background: #ffffff !important;
+  font-size: 14px !important;
+  padding: 10px 12px !important;
+}
+.dark input,
+.dark textarea,
+.dark select {
+  background: #111827 !important;
+  border-color: #374151 !important;
+}
+input:focus,
+textarea:focus {
+  border-color: #4f46e5 !important;
+  outline: none !important;
+  box-shadow: 0 0 0 3px rgba(79,70,229,0.12) !important;
+}
+/* Accordion — Gradio 6 has no .accordion class; the only signal is a .block
+   that *contains* a button.label-wrap. Use :has() to match precisely. */
+.block.padded:has(> button.label-wrap) {
+  background: #ffffff !important;
+  border: 1px solid #e5e7eb !important;
+  border-radius: 10px !important;
+  margin: 14px 0 !important;
+  padding: 0 !important;
+  overflow: hidden !important;
+}
+.dark .block.padded:has(> button.label-wrap) {
+  background: #111827 !important;
+  border-color: #374151 !important;
+}
+button.label-wrap {
+  background: #f8fafc !important;
+  padding: 14px 18px !important;
+  font-weight: 600 !important;
+  font-size: 14px !important;
+  color: #1f2937 !important;
+  width: 100% !important;
+  text-align: left !important;
+  cursor: pointer !important;
+  border: none !important;
+  border-bottom: 1px solid #e5e7eb !important;
+  display: flex !important;
+  justify-content: space-between !important;
+  align-items: center !important;
+  letter-spacing: 0.01em;
+}
+.dark button.label-wrap {
+  background: #1e293b !important;
+  color: #f1f5f9 !important;
+  border-bottom-color: #374151 !important;
+}
+button.label-wrap:hover { background: #f1f5f9 !important; }
+.dark button.label-wrap:hover { background: #334155 !important; }
+/* Sibling content of the header (the body when expanded) */
+.block.padded:has(> button.label-wrap) > *:not(button.label-wrap) {
+  padding: 16px 18px !important;
+  background: #ffffff !important;
+}
+.dark .block.padded:has(> button.label-wrap) > *:not(button.label-wrap) {
+  background: #111827 !important;
+}
+/* gr.Examples table — the default Gradio render is a raw HTML table with black
+   borders and no hover state. Style it to match the rest of the page. */
+.gradio-dataset,
+[data-testid="dataset"] {
+  margin-top: 24px !important;
+  background: transparent !important;
+  border: none !important;
+}
+.gradio-dataset table,
+[data-testid="dataset"] table {
+  border-collapse: collapse !important;
+  border: 1px solid #e5e7eb !important;
+  border-radius: 8px !important;
+  overflow: hidden !important;
+  font-size: 13px !important;
+  width: 100% !important;
+}
+.dark .gradio-dataset table,
+.dark [data-testid="dataset"] table { border-color: #374151 !important; }
+.gradio-dataset thead,
+[data-testid="dataset"] thead { background: #f9fafb !important; }
+.dark .gradio-dataset thead,
+.dark [data-testid="dataset"] thead { background: #111827 !important; }
+.gradio-dataset th,
+[data-testid="dataset"] th {
+  font-size: 11px !important;
+  font-weight: 600 !important;
+  text-transform: uppercase !important;
+  letter-spacing: 0.05em !important;
+  color: #6b7280 !important;
+  text-align: left !important;
+  padding: 10px 12px !important;
+  border: none !important;
+  border-bottom: 1px solid #e5e7eb !important;
+}
+.gradio-dataset td,
+[data-testid="dataset"] td {
+  padding: 9px 12px !important;
+  border: none !important;
+  border-bottom: 1px solid #f3f4f6 !important;
+  color: #1f2937 !important;
+  font-size: 13px !important;
+  background: transparent !important;
+  cursor: pointer !important;
+}
+.dark .gradio-dataset td,
+.dark [data-testid="dataset"] td {
+  color: #e5e7eb !important;
+  border-bottom-color: #1f2937 !important;
+}
+.gradio-dataset tbody tr:last-child td,
+[data-testid="dataset"] tbody tr:last-child td { border-bottom: none !important; }
+.gradio-dataset tbody tr:hover,
+[data-testid="dataset"] tbody tr:hover { background: rgba(79, 70, 229, 0.04) !important; }
+.dark .gradio-dataset tbody tr:hover,
+.dark [data-testid="dataset"] tbody tr:hover { background: rgba(129, 140, 248, 0.08) !important; }
+/* Examples header label — Gradio puts a "Try one of these" label above */
+.gradio-dataset > .label,
+[data-testid="dataset"] > .label,
+.gradio-dataset .block-label,
+.dataset .block-label {
+  font-size: 11px !important;
+  font-weight: 600 !important;
+  text-transform: uppercase !important;
+  letter-spacing: 0.06em !important;
+  color: #6b7280 !important;
+  background: transparent !important;
+  border: none !important;
+  padding: 0 0 6px 0 !important;
+  margin-bottom: 0 !important;
+}
+/* Footer link strip */
+.lc-footer {
+  margin-top: 28px;
+  padding: 14px 0;
+  border-top: 1px solid #e5e7eb;
+  font-size: 13px !important;
+  color: #6b7280 !important;
+}
+.dark .lc-footer { border-top-color: #374151; }
+.lc-footer a { color: #4f46e5 !important; text-decoration: none; }
+.lc-footer a:hover { text-decoration: underline; }
+.dark .lc-footer a { color: #818cf8 !important; }
+/* Result wrapper */
+.lc-result {
+  padding: 4px 0;
+  font-size: 14px;
+  line-height: 1.55;
+  color: #111827 !important;
+}
+.dark .lc-result { color: #f3f4f6 !important; }
+/* Headline */
+.lc-header { padding: 4px 0 14px 0; border-bottom: 1px solid #e5e7eb; }
+.dark .lc-header { border-bottom-color: #374151; }
+.lc-title {
+  font-size: 22px !important;
+  font-weight: 700 !important;
+  letter-spacing: -0.01em;
+  color: #0f172a !important;
+}
+.dark .lc-title { color: #f8fafc !important; }
+.lc-subtitle {
+  font-size: 13px !important;
+  color: #6b7280 !important;
+  margin-top: 2px;
+}
+/* Headline stat cards */
+.lc-stats {
+  display: grid;
+  grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
+  gap: 12px;
+  margin: 16px 0 8px 0;
+}
+.lc-stat {
+  border: 1px solid #e5e7eb;
+  border-radius: 10px;
+  padding: 14px 16px;
+  background: #ffffff;
+}
+.dark .lc-stat { background: #111827; border-color: #374151; }
+.lc-stat-value {
+  font-size: 24px !important;
+  font-weight: 700 !important;
+  letter-spacing: -0.01em;
+  line-height: 1.2;
+  color: #0f172a !important;
+}
+.dark .lc-stat-value { color: #f8fafc !important; }
+.lc-stat-label {
+  font-size: 11px !important;
+  text-transform: uppercase;
+  letter-spacing: 0.05em;
+  color: #6b7280 !important;
+  margin-top: 4px;
+  font-weight: 500 !important;
+}
+.lc-stat-sub {
+  font-size: 11px !important;
+  color: #9ca3af !important;
+  margin-top: 2px;
+}
+.lc-stat-chip { margin-top: 10px; }
+.lc-chip {
+  display: inline-block;
+  padding: 2px 8px;
+  border-radius: 999px;
+  font-size: 11px !important;
+  font-weight: 600 !important;
+  letter-spacing: 0.02em;
+}
+.lc-prov {
+  margin-top: 6px;
+  font-size: 12px !important;
+  color: #6b7280 !important;
+  font-style: italic;
+}
+/* Sections */
+.lc-section { margin: 24px 0 0 0; }
+.lc-section h3 {
+  font-size: 13px !important;
+  font-weight: 600 !important;
+  text-transform: uppercase;
+  letter-spacing: 0.06em;
+  color: #6b7280 !important;
+  margin: 0 0 6px 0 !important;
+}
+.lc-section-help {
+  font-size: 12px !important;
+  color: #6b7280 !important;
+  margin: 0 0 10px 0;
+  line-height: 1.5;
+}
+/* Tables */
+.lc-table {
+  width: 100%;
+  border-collapse: collapse;
+  font-size: 13px !important;
+  color: #111827 !important;
+}
+.dark .lc-table { color: #f3f4f6 !important; }
+.lc-table th, .lc-table td {
+  padding: 8px 10px;
+  border-bottom: 1px solid #f3f4f6;
+  text-align: left;
+}
+.dark .lc-table th, .dark .lc-table td { border-bottom-color: #1f2937; }
+.lc-table th {
+  font-size: 11px !important;
+  text-transform: uppercase;
+  letter-spacing: 0.04em;
+  color: #6b7280 !important;
+  font-weight: 500 !important;
+}
+.lc-table-recon td:nth-child(2),
+.lc-table-recon td:nth-child(3) { text-align: right; }
+.lc-best { background: rgba(22, 163, 74, 0.08); }
+.dark .lc-best { background: rgba(22, 163, 74, 0.18); }
+/* Performance grid */
+.lc-perf {
+  display: grid;
+  grid-template-columns: repeat(auto-fit, minmax(170px, 1fr));
+  gap: 12px;
+}
+.lc-perf-item {
+  border: 1px solid #e5e7eb;
+  border-radius: 10px;
+  padding: 12px 14px;
+  background: #ffffff;
+}
+.dark .lc-perf-item { border-color: #374151; background: #111827; }
+.lc-perf-value {
+  font-size: 20px !important;
+  font-weight: 700 !important;
+  letter-spacing: -0.01em;
+  color: #0f172a !important;
+  line-height: 1.2;
+}
+.dark .lc-perf-value { color: #f8fafc !important; }
+.lc-perf-value code {
+  font-size: 16px !important;
+  font-weight: 600 !important;
+  background: transparent !important;
+  color: #0f172a !important;
+  padding: 0 !important;
+}
+.dark .lc-perf-value code { color: #f8fafc !important; }
+.lc-perf-label {
+  font-size: 11px !important;
+  text-transform: uppercase;
+  letter-spacing: 0.05em;
+  color: #6b7280 !important;
+  margin-top: 4px;
+  font-weight: 500 !important;
+}
+.lc-perf-sub {
+  font-size: 11px !important;
+  color: #9ca3af !important;
+  margin-top: 1px;
+}
+/* Inline code */
+.lc-result code {
+  font-family: "SF Mono", "JetBrains Mono", Menlo, Consolas, monospace !important;
+  font-size: 0.92em !important;
+  color: #0f172a !important;
+  background: rgba(15, 23, 42, 0.06);
+  padding: 1px 5px;
+  border-radius: 4px;
+}
+.dark .lc-result code {
+  color: #e2e8f0 !important;
+  background: rgba(226, 232, 240, 0.08);
+}
+/* Generated command — ALWAYS dark theme regardless of mode */
+.lc-cmd {
+  background: #0b1220 !important;
+  color: #f1f5f9 !important;
+  padding: 16px 18px !important;
+  border-radius: 8px;
+  font-size: 12.5px !important;
+  overflow-x: auto;
+  white-space: pre;
+  border: 1px solid #1e293b !important;
+  margin: 0 !important;
+}
+.lc-cmd code {
+  font-family: "SF Mono", "JetBrains Mono", Menlo, Consolas, monospace !important;
+  background: transparent !important;
+  color: #f1f5f9 !important;
+  padding: 0 !important;
+  font-size: 12.5px !important;
+  border-radius: 0 !important;
+}
+/* Comparison view — side-by-side metrics across GPUs */
+.lc-cmp-wrap {
+  overflow-x: auto;
+  margin: 8px 0 12px 0;
+  border: 1px solid #e5e7eb;
+  border-radius: 10px;
+  background: #ffffff;
+}
+.dark .lc-cmp-wrap { background: #111827; border-color: #374151; }
+.lc-cmp-table {
+  width: 100%;
+  border-collapse: collapse;
+  font-size: 13px !important;
+}
+.lc-cmp-table th,
+.lc-cmp-table td {
+  padding: 10px 12px;
+  text-align: left;
+  border-bottom: 1px solid #f3f4f6;
+}
+.dark .lc-cmp-table th,
+.dark .lc-cmp-table td { border-bottom-color: #1f2937; }
+.lc-cmp-table thead th {
+  font-size: 11px !important;
+  text-transform: uppercase;
+  letter-spacing: 0.05em;
+  color: #6b7280 !important;
+  font-weight: 600 !important;
+  background: #f9fafb;
+}
+.dark .lc-cmp-table thead th { background: #1e293b; color: #9ca3af !important; }
+.lc-cmp-row-label {
+  font-size: 12px !important;
+  color: #6b7280 !important;
+  font-weight: 600 !important;
+  white-space: nowrap;
+}
+.lc-cmp-row-info {
+  font-style: italic;
+  color: #9ca3af !important;
+}
+.dark .lc-cmp-row-info { color: #6b7280 !important; }
+.lc-cmp-tr-info td {
+  color: #6b7280;
+  background: #fafafa;
+}
+.dark .lc-cmp-tr-info td { color: #9ca3af; background: #0f172a; }
+.lc-cmp-gpu {
+  font-family: "SF Mono", "JetBrains Mono", Menlo, monospace !important;
+  font-size: 12px !important;
+}
+.lc-cmp-table tbody tr:last-child td { border-bottom: none; }
+.lc-cmp-winner {
+  background: rgba(22, 163, 74, 0.10) !important;
+  font-weight: 700 !important;
+  color: #15803d !important;
+  position: relative;
+}
+.dark .lc-cmp-winner { background: rgba(74, 222, 128, 0.15) !important; color: #4ade80 !important; }
+.lc-cmp-winner::before {
+  content: "✓ ";
+  font-size: 11px;
+  font-weight: 700;
+  color: #15803d;
+  margin-right: 2px;
+}
+.dark .lc-cmp-winner::before { color: #4ade80; }
+.lc-cmp-summary {
+  margin-top: 12px;
+  padding: 12px 14px;
+  border-radius: 8px;
+  background: #eef2ff;
+  border: 1px solid #c7d2fe;
+  font-size: 13px !important;
+  color: #312e81 !important;
+}
+.dark .lc-cmp-summary {
+  background: #1e1b4b;
+  border-color: #3730a3;
+  color: #e0e7ff !important;
+}
+.lc-cmp-summary strong { color: #4338ca; }
+.dark .lc-cmp-summary strong { color: #a5b4fc; }
+/* Per-GPU detail cards under the table */
+.lc-cmp-details {
+  display: grid;
+  grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+  gap: 12px;
+}
+.lc-cmp-detail {
+  border: 1px solid #e5e7eb;
+  border-radius: 10px;
+  padding: 12px 14px;
+  background: #ffffff;
+}
+.dark .lc-cmp-detail { background: #111827; border-color: #374151; }
+.lc-cmp-detail-gpu {
+  font-family: "SF Mono", "JetBrains Mono", Menlo, monospace !important;
+  font-size: 13px !important;
+  font-weight: 700 !important;
+  color: #0f172a !important;
+  margin-bottom: 6px;
+  padding-bottom: 6px;
+  border-bottom: 1px solid #e5e7eb;
+}
+.dark .lc-cmp-detail-gpu { color: #f8fafc !important; border-bottom-color: #374151; }
+.lc-cmp-detail-row {
+  display: flex;
+  justify-content: space-between;
+  font-size: 12px !important;
+  padding: 3px 0;
+}
+.lc-cmp-detail-row span { color: #6b7280 !important; }
+.lc-cmp-detail-row strong {
+  color: #0f172a !important;
+  font-size: 13px !important;
+}
+.dark .lc-cmp-detail-row strong { color: #f8fafc !important; }
+/* Star-on-GitHub CTA — shown at the bottom of the result, capturing the
+   peak-satisfaction moment. Card-style with indigo accent so it reads as
+   "thanks", not as a banner ad. */
+.lc-star-cta {
+  display: flex;
+  align-items: center;
+  gap: 14px;
+  margin: 28px 0 8px 0;
+  padding: 14px 18px;
+  border: 1px solid #c7d2fe;
+  background: #eef2ff;
+  border-radius: 10px;
+  text-decoration: none !important;
+  color: #312e81 !important;
+  transition: background 0.15s ease, border-color 0.15s ease, transform 0.1s ease;
+}
+.lc-star-cta:hover {
+  background: #e0e7ff;
+  border-color: #a5b4fc;
+}
+.lc-star-cta:active { transform: scale(0.995); }
+.dark .lc-star-cta {
+  background: #1e1b4b;
+  border-color: #3730a3;
+  color: #c7d2fe !important;
+}
+.dark .lc-star-cta:hover { background: #312e81; }
+.lc-star-cta svg { flex: 0 0 auto; color: #4338ca; }
+.dark .lc-star-cta svg { color: #a5b4fc; }
+.lc-star-cta-text { flex: 1 1 auto; min-width: 0; }
+.lc-star-cta-q {
+  font-size: 14px !important;
+  font-weight: 600 !important;
+  line-height: 1.3;
+  color: #312e81 !important;
+}
+.dark .lc-star-cta-q { color: #e0e7ff !important; }
+.lc-star-cta-q-en {
+  font-size: 12px !important;
+  color: #6366f1 !important;
+  margin-top: 2px;
+  line-height: 1.3;
+}
+.dark .lc-star-cta-q-en { color: #a5b4fc !important; }
+.lc-star-cta-action {
+  flex: 0 0 auto;
+  font-size: 13px !important;
+  font-weight: 700 !important;
+  color: #4338ca !important;
+  white-space: nowrap;
+}
+.dark .lc-star-cta-action { color: #c7d2fe !important; }
+@media (max-width: 540px) {
+  .lc-star-cta { flex-wrap: wrap; gap: 10px; }
+  .lc-star-cta-action { flex-basis: 100%; }
+}
+/* Loading + error */
+.lc-loading {
+  display: flex;
+  align-items: center;
+  gap: 14px;
+  padding: 24px;
+  color: #6b7280 !important;
+  font-size: 14px !important;
+}
+.lc-spinner {
+  width: 18px; height: 18px;
+  border: 2px solid #cbd5e1;
+  border-top-color: #4f46e5;
+  border-radius: 50%;
+  animation: lc-spin 0.7s linear infinite;
+  flex: none;
+}
+@keyframes lc-spin { to { transform: rotate(360deg); } }
+.lc-error pre {
+  background: #fef2f2;
+  color: #991b1b !important;
+  padding: 12px 14px;
+  border-radius: 8px;
+  border: 1px solid #fecaca;
+  font-size: 12px !important;
+  white-space: pre-wrap;
+  word-break: break-word;
+  margin: 0;
+}
+.dark .lc-error pre { background: #450a0a; color: #fca5a5 !important; border-color: #7f1d1d; }
+/* Explain trace */
+.lc-explain-entry {
+  margin: 14px 0;
+  padding: 14px 16px;
+  border: 1px solid #e5e7eb;
+  border-left: 3px solid #4f46e5;
+  border-radius: 8px;
+  background: #fafafa;
+}
+.dark .lc-explain-entry { background: #0f172a; border-color: #374151; border-left-color: #818cf8; }
+.lc-explain-heading {
+  font-weight: 700 !important;
+  font-size: 14px !important;
+  margin-bottom: 8px;
+  color: #0f172a !important;
+}
+.dark .lc-explain-heading { color: #f8fafc !important; }
+.lc-explain-formula {
+  margin: 6px 0;
+  font-size: 12.5px !important;
+}
+.lc-explain-formula code {
+  background: rgba(79, 70, 229, 0.08) !important;
+  color: #4338ca !important;
+  padding: 4px 8px !important;
+  border-radius: 4px;
+}
+.dark .lc-explain-formula code { color: #a5b4fc !important; background: rgba(165, 180, 252, 0.12) !important; }
+.lc-explain-inputs, .lc-explain-steps {
+  margin: 6px 0 6px 1.2em;
+  font-size: 12.5px !important;
+  line-height: 1.7;
+}
+.lc-explain-label {
+  font-size: 11px !important;
+  color: #6b7280 !important;
+  font-style: italic;
+}
+.lc-explain-result {
+  margin-top: 8px;
+  padding-top: 8px;
+  border-top: 1px dashed #e5e7eb;
+  font-size: 13px !important;
+  color: #0f172a !important;
+}
+.dark .lc-explain-result { color: #f8fafc !important; border-top-color: #374151; }
+/* LLM review */
+.lc-llm-banner {
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  padding: 8px 12px;
+  background: #f9fafb;
+  border: 1px solid #e5e7eb;
+  border-radius: 8px;
+  font-size: 12px !important;
+  color: #4b5563 !important;
+  margin-bottom: 12px;
+}
+.dark .lc-llm-banner { color: #d1d5db !important; background: #111827; border-color: #374151; }
+.lc-llm-model {
+  font-size: 11px !important;
+  color: #6b7280 !important;
+  font-weight: 500 !important;
+  margin-left: 6px;
+  text-transform: none !important;
+  letter-spacing: 0 !important;
+}
+.lc-llm-content {
+  font-size: 13px !important;
+  line-height: 1.7;
+  color: #0f172a !important;
+  padding: 12px 14px;
+  border: 1px solid #e5e7eb;
+  border-radius: 8px;
+  background: #ffffff;
+}
+.dark .lc-llm-content { color: #f3f4f6 !important; background: #111827; border-color: #374151; }
+"""
+def _build_ui() -> gr.Blocks:
+    """Assemble the Gradio Blocks app and return it without launching.
+    Layout, top to bottom: hero HTML, the required inputs (model id, source,
+    GPU vendor/model, engine, context length, output language), two collapsed
+    accordions (performance tuning, advanced), the submit button, three HTML
+    output panes, the examples table and a footer. Event handlers are wired
+    last: a vendor change repopulates the GPU dropdown, and the submit button
+    runs `show_loading` followed by `calculate`.
+    """
+    with gr.Blocks(title="llm-cal — LLM hardware calculator") as demo:
+        gr.HTML(HERO_HTML)
+        # ---- Required ----------------------------------------------------
+        with gr.Row():
+            model_id = gr.Textbox(
+                label="Model ID · 模型 ID",
+                placeholder="e.g. deepseek-ai/DeepSeek-V4-Flash",
+                info="Repo id · 仓库 ID（owner/name）",
+                scale=3,
+            )
+            source = gr.Radio(
+                choices=["HuggingFace", "ModelScope"],
+                value="HuggingFace",
+                label="Source · 来源",
+                info="Where to pull model metadata · 拉取来源",
+                scale=2,
+            )
+        with gr.Row():
+            vendor = gr.Dropdown(
+                choices=VENDOR_CHOICES_EN,
+                value=DEFAULT_VENDOR,
+                label="GPU vendor · GPU 厂商",
+                info="11 vendors covered · 共 11 家",
+                scale=1,
+            )
+            # NOTE(review): allow_custom_value presumably lets GPUs picked under
+            # a previous vendor stay selected after `choices` is swapped by
+            # _on_vendor_change below — confirm against Gradio's Dropdown docs.
+            gpu = gr.Dropdown(
+                choices=_VENDOR_TO_GPUS[DEFAULT_VENDOR],
+                value=[DEFAULT_GPU],
+                label="GPU model · GPU 型号",
+                info="One GPU = single eval. 2-4 = compare side-by-side · 选 1 张单评估，2-4 张对比",
+                scale=2,
+                multiselect=True,
+                max_choices=4,
+                allow_custom_value=True,
+            )
+        with gr.Row():
+            engine = gr.Radio(
+                choices=["vllm", "sglang"],
+                value="vllm",
+                label="Engine · 引擎",
+                info="Inference engine · 推理引擎",
+            )
+            context_length = gr.Number(
+                label="Context length · Context 长度",
+                value=None,
+                precision=0,
+                info="Empty = 4K/32K/128K/1M · 留空显示全档",
+            )
+            lang = gr.Radio(
+                choices=["English", "中文"],
+                value="English",
+                label="Output language · 输出语言",
+                info="Result area only · 仅影响下方结果区",
+            )
+        # ---- Performance tuning (collapsible) ----------------------------
+        with gr.Accordion("Performance tuning · 性能参数", open=False):
+            with gr.Row():
+                input_tokens = gr.Number(
+                    label="Input tokens · 输入 tokens",
+                    value=2000,
+                    precision=0,
+                    info="Prefill budget · Prefill 预算",
+                )
+                output_tokens = gr.Number(
+                    label="Output tokens · 输出 tokens",
+                    value=512,
+                    precision=0,
+                    info="Decode budget · Decode 预算",
+                )
+                target_tps = gr.Number(
+                    label="Target tok/s/user · 单用户目标 tok/s",
+                    value=30.0,
+                    info="SLA per user · 单用户 SLA（30 ≈ 流畅阅读）",
+                )
+            with gr.Row():
+                prefill_util = gr.Number(
+                    label="Prefill util · Prefill 利用率",
+                    value=0.40,
+                    info="0–1 · 0.40 = vLLM paper baseline",
+                )
+                decode_bw_util = gr.Number(
+                    label="Decode BW util · Decode 带宽利用率",
+                    value=0.50,
+                    info="0–1 · 0.50 = community median",
+                )
+                concurrency_degradation = gr.Number(
+                    label="Concurrency degradation · 并发衰减",
+                    value=1.0,
+                    info="1.0 = honest · 1.67 = 60% efficiency under load",
+                )
+        # ---- Advanced (collapsible) --------------------------------------
+        with gr.Accordion("Advanced · 高级", open=False):
+            with gr.Row():
+                hf_token = gr.Textbox(
+                    label="HF_TOKEN",
+                    value="",
+                    placeholder="hf_...",
+                    type="password",
+                    info="For gated HF models · 私有 HF 模型用",
+                )
+                ms_token = gr.Textbox(
+                    label="MODELSCOPE_API_TOKEN",
+                    value="",
+                    placeholder="ms-...",
+                    type="password",
+                    info="For gated MS models · 私有 MS 模型用",
+                )
+            with gr.Row():
+                gpu_count = gr.Number(
+                    label="Force GPU count · 强制 GPU 数",
+                    value=None,
+                    precision=0,
+                    info="Empty = auto min/dev/prod · 留空自动给三档",
+                )
+                refresh = gr.Checkbox(
+                    label="Refresh cache · 刷新缓存",
+                    value=False,
+                    info="Bypass diskcache · 跳过本地缓存",
+                )
+            with gr.Row():
+                explain = gr.Checkbox(
+                    label="--explain · 推导链",
+                    value=False,
+                    info="Full derivation trace · 输出完整推导链",
+                )
+                llm_review = gr.Checkbox(
+                    label="--llm-review · LLM 审计",
+                    value=False,
+                    info="Second opinion from an LLM · 第二意见审计",
+                )
+            with gr.Row():
+                llm_api_key = gr.Textbox(
+                    label="LLM API key · LLM API 密钥",
+                    value="",
+                    placeholder="sk-...",
+                    type="password",
+                    info="OpenAI-compatible endpoint · OpenAI 兼容端点",
+                )
+                llm_base_url = gr.Textbox(
+                    label="LLM base URL · LLM 基地址",
+                    value="",
+                    placeholder="https://api.openai.com/v1",
+                    info="e.g. https://api.deepseek.com/v1",
+                )
+                llm_model = gr.Textbox(
+                    label="LLM model · LLM 模型名",
+                    value="",
+                    placeholder="gpt-4o",
+                    info="e.g. gpt-4o / deepseek-chat / MiniMax-M2",
+                )
+        with gr.Row(elem_classes="lc-submit-wrap"):
+            submit = gr.Button("Calculate · 计算", variant="primary", size="lg")
+        # Three output panes — main always shows, explain/llm-review only when toggled
+        output_main = gr.HTML(label="Result")
+        output_explain = gr.HTML(label="Explain trace")
+        output_llm = gr.HTML(label="LLM review")
+        # Each example row is laid out in the same order as `inputs` below:
+        # model id, vendor, [gpu], engine, context length (None), language, source.
+        gr.Examples(
+            examples=[
+                # gpu wrapped in a list — the Dropdown is multiselect now
+                [m, v, [g], e, None, "English", s]
+                for m, v, g, e, s in EXAMPLE_MODELS
+            ],
+            inputs=[model_id, vendor, gpu, engine, context_length, lang, source],
+            label="Try one of these · 试试这些组合",
+        )
+        gr.HTML(
+            "<div class='lc-footer'>"
+            "<a href='https://github.com/FlyTOmeLight/llm-cal' target='_blank'>GitHub</a> · "
+            "<a href='https://flytomelight.github.io/llm-cal/' target='_blank'>Docs</a> · "
+            "<a href='https://flytomelight.github.io/llm-cal/methodology/' target='_blank'>Methodology</a> · "
+            "<code>pip install llm-cal</code>"
+            "</div>"
+        )
+        # When vendor changes, repopulate the GPU dropdown but PRESERVE any
+        # cross-vendor selections (the whole point of compare mode is to
+        # stack e.g. H800 + MI300X + 910B4 across NVIDIA/AMD/Ascend).
+        def _on_vendor_change(v: str, current):  # noqa: ANN001, ANN202
+            """Build the GPU dropdown state for vendor `v`.
+            `current` is the GPU dropdown's present value. The returned
+            gr.Dropdown lists the GPUs of `v` (none for an unknown vendor) as
+            choices and keeps `current` selected; when nothing is selected it
+            selects the vendor's first GPU, if there is one.
+            """
+            gpus = _VENDOR_TO_GPUS.get(v, [])
+            # multiselect returns list; harden against str/None for safety
+            if isinstance(current, list):
+                keep = list(current)
+            elif current:
+                keep = [current]
+            else:
+                keep = []
+            # Empty selection? Seed with the first GPU so the form stays usable.
+            if not keep:
+                keep = [gpus[0]] if gpus else []
+            return gr.Dropdown(choices=gpus, value=keep)
+        vendor.change(fn=_on_vendor_change, inputs=[vendor, gpu], outputs=[gpu])
+        # Click flow: instantly show "loading…", THEN run calculate.
+        all_outputs = [output_main, output_explain, output_llm]
+        # NOTE(review): the `inputs` list for `calculate` is presumably handed
+        # over positionally in this order — confirm against its signature
+        # before reordering or inserting components.
+        submit.click(
+            fn=show_loading,
+            inputs=[lang],
+            outputs=all_outputs,
+        ).then(
+            fn=calculate,
+            inputs=[
+                model_id, gpu, engine, context_length, lang, source,
+                gpu_count, input_tokens, output_tokens, target_tps,
+                prefill_util, decode_bw_util, concurrency_degradation,
+                refresh, explain, llm_review,
+                hf_token, ms_token,
+                llm_api_key, llm_base_url, llm_model,
+            ],
+            outputs=all_outputs,
+        )
+    return demo
+def _prewarm_cache() -> None:
+    """Fill the artifact cache for every Examples row so first-click users
+    don't pay the 3-8s HF/MS metadata roundtrip.
+    Runs on a daemon thread alongside the Gradio server. Failures are
+    swallowed (printed only) — pre-warm is a UX nicety, never a hard
+    dependency. Set LLM_CAL_PREWARM=0 to disable (useful for local dev
+    when you don't want 9 API calls every time you `python web/app.py`).
+    """
+    import time
+    print(f"[prewarm] starting cache warm-up for {len(EXAMPLE_MODELS)} examples")
+    for i, (model_id, _vendor, gpu, engine, source) in enumerate(EXAMPLE_MODELS, 1):
+        src_key = "modelscope" if "modelscope" in source.lower() else "huggingface"
+        label = f"{i}/{len(EXAMPLE_MODELS)} {src_key}:{model_id}"
+        try:
+            t0 = time.monotonic()
+            _get_evaluator(src_key).evaluate(
+                model_id=model_id,
+                gpu=gpu,
+                engine=engine,
+            )
+            print(f"[prewarm] {label} ok ({time.monotonic() - t0:.1f}s)")
+        except Exception as e:  # noqa: BLE001
+            print(f"[prewarm] {label} skip — {type(e).__name__}: {e}")
+        # Throttle to stay well under HF/MS anonymous rate limits.
+        time.sleep(2)
+    print("[prewarm] done")
+if __name__ == "__main__":
+    if os.environ.get("LLM_CAL_PREWARM", "1") == "1":
+        import threading
+        threading.Thread(target=_prewarm_cache, daemon=True).start()
+    _build_ui().launch(theme=THEME, css=CUSTOM_CSS)

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ gradio>=6.0,<7.0
2	+ llm-cal>=0.1.3

src/llm_cal/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""llm-cal — LLM inference hardware calculator."""
+from llm_cal.core.evaluator import Evaluator
+from llm_cal.output.labels import Label
+# Public API of the package: the names exported by `from llm_cal import *`.
+__all__ = ["Evaluator", "Label"]

src/llm_cal/architecture/__init__.py ADDED Viewed

File without changes

src/llm_cal/architecture/detector.py ADDED Viewed

	@@ -0,0 +1,134 @@

+"""`detect()` — main orchestration over trait sub-detectors.
+Step 1: Family dispatch (state_space vs transformer vs unknown).
+Step 2: Gather traits (independent sub-detectors).
+Step 3: Assemble Profile with a confidence level.
+Fallback path: `_fallback_unknown()` for configs missing key fields. This is
+the bedrock of "works on day-0" — new model types degrade gracefully.
+"""
+from __future__ import annotations
+from typing import Any
+from llm_cal.architecture.profile import (
+    ArchitectureProfile,
+    Confidence,
+    Family,
+)
+from llm_cal.architecture.traits import (
+    detect_attention,
+    detect_moe,
+    detect_position,
+    detect_sliding_window,
+)
+# Model types we know we handle well. Maintained alongside engine_compat matrix.
+# detect() reports Confidence.HIGH for a transformer config whose lowercased
+# `model_type` is in this set, and Confidence.MEDIUM otherwise.
+KNOWN_MODEL_TYPES: frozenset[str] = frozenset(
+    {
+        "llama",
+        "mistral",
+        "mixtral",
+        "qwen2",
+        "qwen2_moe",
+        "qwen3",
+        "qwen3_moe",
+        "deepseek_v2",
+        "deepseek_v3",
+        "deepseek_v3_2",
+        "deepseek_v4",
+        "gemma",
+        "gemma2",
+        "gemma3",
+        "phi",
+        "phi3",
+    }
+)
+# Model types that detect() classifies as Family.STATE_SPACE before any
+# transformer trait detection runs (a config with an "ssm_cfg" key is treated
+# the same way).
+STATE_SPACE_TYPES: frozenset[str] = frozenset({"mamba", "mamba2", "falcon_mamba", "jamba"})
+def detect(config: dict[str, Any]) -> ArchitectureProfile:
+    """Main entry. Given a parsed config.json dict, return an ArchitectureProfile."""
+    model_type = str(config.get("model_type", "")).lower()
+    # Step 1: state_space family short-circuits — v0.1 unsupported, but we identify it
+    if model_type in STATE_SPACE_TYPES or "ssm_cfg" in config:
+        return ArchitectureProfile(
+            model_type=model_type,
+            architectures=tuple(str(a).lower() for a in config.get("architectures", [])),
+            family=Family.STATE_SPACE,
+            num_hidden_layers=int(config.get("num_hidden_layers", 0)),
+            hidden_size=int(config.get("hidden_size", 0)),
+            vocab_size=int(config.get("vocab_size", 0)),
+            confidence=Confidence.HIGH,
+            auxiliary={"v0_1_unsupported": True},
+        )
+    # Step 2: reject if fundamentally unidentifiable
+    if not model_type and not config.get("architectures"):
+        return _fallback_unknown(config)
+    # Step 3: required fields
+    num_layers = config.get("num_hidden_layers")
+    hidden_size = config.get("hidden_size")
+    if not num_layers or not hidden_size:
+        return _fallback_unknown(config)
+    # Step 4: gather traits (each is independent and may return None)
+    attention = detect_attention(config)
+    moe = detect_moe(config)
+    position = detect_position(config)
+    sliding = detect_sliding_window(config)
+    # Step 5: confidence — HIGH iff model_type is in the registry
+    confidence = Confidence.HIGH if model_type in KNOWN_MODEL_TYPES else Confidence.MEDIUM
+    # Pass-through of config fields our formulas can use downstream. Keeps the
+    # Profile schema stable while enabling richer computation (e.g. dense FFN
+    # param count needs intermediate_size).
+    auxiliary: dict[str, object] = {}
+    if isinstance(config.get("intermediate_size"), int):
+        auxiliary["intermediate_size"] = config["intermediate_size"]
+    if config.get("tie_word_embeddings") is not None:
+        auxiliary["tie_word_embeddings"] = bool(config["tie_word_embeddings"])
+    return ArchitectureProfile(
+        model_type=model_type,
+        architectures=tuple(str(a).lower() for a in config.get("architectures", [])),
+        family=Family.TRANSFORMER,
+        num_hidden_layers=int(num_layers),
+        hidden_size=int(hidden_size),
+        vocab_size=int(config.get("vocab_size", 0)),
+        confidence=confidence,
+        attention=attention,
+        moe=moe,
+        position=position,
+        sliding_window=sliding,
+        auxiliary=auxiliary,
+    )
+def _fallback_unknown(config: dict[str, Any]) -> ArchitectureProfile:
+    """Graceful degradation when config.json is unusable.
+    Still returns a valid Profile. Consumers check `family == Family.UNKNOWN`
+    or `confidence == Confidence.LOW` and skip KV-cache estimation accordingly.
+    """
+    return ArchitectureProfile(
+        model_type=str(config.get("model_type", "")).lower(),
+        architectures=tuple(str(a).lower() for a in config.get("architectures", [])),
+        family=Family.UNKNOWN,
+        num_hidden_layers=int(config.get("num_hidden_layers", 0)),
+        hidden_size=int(config.get("hidden_size", 0)),
+        vocab_size=int(config.get("vocab_size", 0)),
+        confidence=Confidence.LOW,
+        auxiliary={
+            "warning": (
+                "No recognizable model_type or missing essential config fields. "
+                "Weight estimate from safetensors file size only; "
+                "KV cache cannot be estimated; engine compatibility unknown."
+            )
+        },
+    )

src/llm_cal/architecture/formulas/__init__.py ADDED Viewed

File without changes

src/llm_cal/architecture/formulas/kv_cache.py ADDED Viewed

	@@ -0,0 +1,145 @@

+"""KV cache estimation — traits-composed formula.
+The formula is NOT owned by a single architecture module. Instead we compose it
+from the traits on `ArchitectureProfile`:
+  baseline = 2 (K+V) * num_kv_heads * head_dim * seq_len * dtype_bytes * num_layers
+Then apply compositional modifiers:
+  * MLA:            baseline uses kv_lora_rank instead of num_kv_heads * head_dim
+                    (DeepSeek's compressed KV representation)
+  * CSA_HCA:        multiply by an effective-ratio derived from compress_ratios
+                    (most layers are heavily compressed, a few are dense)
+  * Sliding window: cap `seq_len` at the window size
+  * NSA:            multiply by (nsa_topk / seq_len), clamped — sparse attention
+                    keeps only top-k keys
+Returns AnnotatedValue tagged [estimated] unless we can't compute it at all.
+"""
+from __future__ import annotations
+from llm_cal.architecture.profile import (
+    ArchitectureProfile,
+    AttentionTraits,
+    Confidence,
+    Family,
+)
+from llm_cal.output.labels import AnnotatedValue, Label
def compute_kv_cache_bytes(
    profile: ArchitectureProfile,
    seq_len: int,
    dtype_bytes: int = 2,  # BF16/FP16 default
) -> AnnotatedValue[int]:
    """Estimate the KV cache size of one request holding `seq_len` tokens.

    Args:
        profile: Architecture snapshot supplying attention traits, layer count
            and the optional sliding window.
        seq_len: Number of tokens in the request.
        dtype_bytes: Bytes per cached element.

    Returns:
        An AnnotatedValue of bytes. It is labelled ESTIMATED when a number
        could be produced (including the trivial 0 for `seq_len <= 0`), and
        UNKNOWN with value 0 when the profile gives nothing to compute from.
    """
    if seq_len <= 0:
        return AnnotatedValue(0, Label.ESTIMATED, source="seq_len <= 0")

    # Decide up front whether the profile is usable; first matching reason wins.
    give_up_reason = None
    if profile.family == Family.STATE_SPACE:
        give_up_reason = "state-space model has no KV cache concept"
    elif profile.family == Family.UNKNOWN or profile.confidence == Confidence.LOW:
        give_up_reason = "unknown architecture — cannot estimate KV cache"
    elif profile.attention is None or profile.num_hidden_layers <= 0:
        give_up_reason = "missing attention traits or layer count"
    if give_up_reason is not None:
        return AnnotatedValue(0, Label.UNKNOWN, source=give_up_reason)

    traits = profile.attention
    layer_count = profile.num_hidden_layers

    # Effective token count. The sliding-window cap is applied only to
    # non-sparse variants: CSA_HCA and NSA already encode their own reduction,
    # and stacking the cap on top would double-count it.
    window = profile.sliding_window
    uses_sparse_scaling = traits.variant in ("CSA_HCA", "NSA")
    tokens = seq_len
    window_suffix = ""
    if window and window > 0 and not uses_sparse_scaling:
        tokens = min(seq_len, window)
        if tokens < seq_len:
            window_suffix = f" (sliding_window={window} caps {seq_len} -> {tokens})"

    # Dense baseline across the whole layer stack.
    dense_bytes = _per_layer_per_token_bytes(traits, dtype_bytes) * tokens * layer_count

    # Sparse-attention modifier, if the variant carries the data for one.
    total_bytes = dense_bytes
    description: str = str(traits.variant)
    if traits.variant == "CSA_HCA" and traits.compress_ratios:
        keep = _average_csa_hca_ratio(traits.compress_ratios)
        total_bytes = int(dense_bytes * keep)
        description = f"{description} (avg compress ratio {keep:.3f})"
    elif traits.variant == "NSA" and traits.nsa_topk and traits.nsa_topk > 0:
        kept_share = min(1.0, traits.nsa_topk / tokens)
        total_bytes = int(dense_bytes * kept_share)
        description = f"{description} (nsa_topk={traits.nsa_topk}, sparsity={kept_share:.3f})"

    return AnnotatedValue(
        total_bytes,
        Label.ESTIMATED,
        source=(
            f"{description}: 2*kv_shape*{dtype_bytes}B*{tokens}*{layer_count}{window_suffix}"
        ),
    )
+def _per_layer_per_token_bytes(attn: AttentionTraits, dtype_bytes: int) -> int:
+    """Bytes of K+V storage per token per layer, given attention shape."""
+    # MLA: KV is compressed into a single latent vector of size kv_lora_rank.
+    # (Both K and V share it; it's NOT 2 * kv_lora_rank.)
+    if attn.variant == "MLA" and attn.kv_lora_rank:
+        return attn.kv_lora_rank * dtype_bytes
+    # Standard / GQA / MQA / CSA+HCA (the sparse scaling is applied later).
+    # K and V both stored: factor of 2.
+    return 2 * attn.num_kv_heads * attn.head_dim * dtype_bytes
+def _average_csa_hca_ratio(compress_ratios: tuple[int, ...]) -> float:
+    """DeepSeek V4 compress_ratios semantics:
+      0   -> dense attention (keep 100%)
+      N>0 -> keep 1/N of tokens
+    Returns the average "keep fraction" across all layers.
+    Example: ratios = [0, 0, 4, 128, 4, 128, ...]
+      - two dense layers (fraction = 1.0)
+      - remaining alternating 1/4 and 1/128
+      - weighted average across all layers
+    """
+    if not compress_ratios:
+        return 1.0
+    total_fraction = 0.0
+    for r in compress_ratios:
+        if r == 0:
+            total_fraction += 1.0
+        else:
+            total_fraction += 1.0 / r
+    return total_fraction / len(compress_ratios)

src/llm_cal/architecture/formulas/weight.py ADDED Viewed

	@@ -0,0 +1,133 @@

+"""Weight count estimation — total parameters and total bytes by assumption.
+Two distinct purposes, kept separate by label:
+  * estimate_total_params(profile) -> [estimated] param count
+  * predicted_bytes_under_quant(params, scheme) -> [estimated] bytes
+The weight_analyzer/reconciler compares predicted_bytes against observed file
+sizes to identify the actual quantization scheme. That's the DeepSeek-V4 story.
+"""
+from __future__ import annotations
+from llm_cal.architecture.profile import ArchitectureProfile
+from llm_cal.output.labels import AnnotatedValue, Label
+from llm_cal.weight_analyzer import _QUANT_BPP, QuantizationScheme
def estimate_total_params(profile: ArchitectureProfile) -> AnnotatedValue[int]:
    """Rough total parameter count derived from the profile's shape fields.

    Components summed:
      - Embedding: vocab_size * hidden_size, plus an equally sized output
        head unless `auxiliary["tie_word_embeddings"]` is truthy
      - Per-layer attention projections (variant-dependent)
      - Per-layer FFN (dense or MoE)
      - Per-layer norms, counted as 2 * hidden_size

    Returns:
        An [estimated] AnnotatedValue whose `source` spells out the arithmetic
        actually used, or an [unknown] 0 when the layer count or hidden size
        is missing.
    """
    if profile.num_hidden_layers <= 0 or profile.hidden_size <= 0:
        return AnnotatedValue(0, Label.UNKNOWN, source="insufficient shape info in profile")
    hidden = profile.hidden_size
    n_layers = profile.num_hidden_layers
    vocab = profile.vocab_size
    # Embedding + output head. With tied weights the output head IS the
    # embedding, so it must not be counted twice.
    embed_params = vocab * hidden
    tied = bool(profile.auxiliary.get("tie_word_embeddings", False))
    output_head_params = 0 if tied else vocab * hidden
    # Per-layer attention projections.
    attn_params = _attention_params(profile)
    # Per-layer FFN (dense path) OR MoE expert block.
    ffn_params = _ffn_params(profile)
    # Two norms per layer, one scalar per feature.
    norm_params = 2 * hidden
    per_layer = attn_params + ffn_params + norm_params
    total = embed_params + output_head_params + per_layer * n_layers
    # Keep the audit string in step with the arithmetic: the old text claimed
    # "* 2 (embed+head)" even when tying meant the matrix was counted once.
    embed_factor = "1 (tied embed/head)" if tied else "2 (embed+head)"
    return AnnotatedValue(
        total,
        Label.ESTIMATED,
        source=(
            f"{vocab} vocab * {hidden} hidden * {embed_factor} + "
            f"{n_layers} layers * ({attn_params:,} attn + {ffn_params:,} ffn + norms)"
        ),
    )
+def _attention_params(profile: ArchitectureProfile) -> int:
+    """Parameter count for attention projections (Q/K/V/O) in one layer."""
+    attn = profile.attention
+    if attn is None:
+        return 0
+    hidden = profile.hidden_size
+    # MLA uses low-rank projections — very different shape.
+    if attn.variant == "MLA" and attn.q_lora_rank:
+        q_lora = attn.q_lora_rank
+        kv_lora = attn.kv_lora_rank or attn.q_lora_rank  # approximate
+        # W_q_down + W_q_up + W_kv_down + W_kv_up + W_o_down + W_o_up
+        head_total = attn.num_heads * attn.head_dim
+        return (
+            hidden * q_lora  # Q down
+            + q_lora * head_total  # Q up
+            + hidden * kv_lora * 2  # K+V down (shared)
+            + kv_lora * head_total  # K+V up
+            + head_total * q_lora  # O down (reuse q_lora as o_lora approx)
+            + q_lora * hidden  # O up
+        )
+    # Standard/GQA/MQA: Q + K + V + O projections
+    q_out = attn.num_heads * attn.head_dim
+    kv_out = attn.num_kv_heads * attn.head_dim
+    return hidden * q_out + hidden * kv_out * 2 + q_out * hidden
+def _ffn_params(profile: ArchitectureProfile) -> int:
+    """Parameter count for the FFN (MoE or dense) in one layer.
+    For MoE, counts all experts (routed + shared) because they all live in memory.
+    Active parameters per token is a different metric (not our job here).
+    """
+    hidden = profile.hidden_size
+    if profile.moe is not None:
+        moe = profile.moe
+        # SwiGLU-style expert: 3 matrices (gate, up, down), each hidden x moe_intermediate.
+        single_expert = 3 * hidden * moe.moe_intermediate_size
+        total_experts = moe.num_routed_experts + moe.num_shared_experts
+        # Router: hidden x num_routed_experts
+        router = hidden * moe.num_routed_experts
+        return single_expert * total_experts + router
+    # Dense: try to read intermediate_size from auxiliary; fallback to 4 * hidden.
+    intermediate = profile.auxiliary.get("intermediate_size")
+    if not isinstance(intermediate, int) or intermediate <= 0:
+        intermediate = 4 * hidden
+    # SwiGLU: 3 matrices
+    return 3 * hidden * intermediate
def predicted_bytes_under_quant(
    total_params: int, scheme: QuantizationScheme
) -> AnnotatedValue[int]:
    """Predict the on-disk size of `total_params` under a quantization scheme.

    Looks up bytes-per-parameter in `_QUANT_BPP`. A missing or zero entry
    yields an [unknown] 0; otherwise the truncated product is returned as
    [estimated].
    """
    bytes_per_param = _QUANT_BPP.get(scheme, 0.0)
    if bytes_per_param != 0.0:
        return AnnotatedValue(
            int(total_params * bytes_per_param),
            Label.ESTIMATED,
            source=f"{total_params:,} params * {bytes_per_param} bytes/param ({scheme})",
        )
    return AnnotatedValue(
        0,
        Label.UNKNOWN,
        source=f"no bytes-per-param mapping for {scheme}",
    )

src/llm_cal/architecture/profile.py ADDED Viewed

	@@ -0,0 +1,97 @@

+"""ArchitectureProfile — the core data class the whole tool orbits.
+Key insight: an architecture is NOT a single label. It's a combination of independent
+traits that co-exist on a Profile. DeepSeek-V3.2 = MoE + MLA + NSA — three traits.
+Single-module dispatch cannot express this; traits composition can.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from enum import StrEnum
+from typing import Literal
+class Family(StrEnum):
+    TRANSFORMER = "transformer"
+    STATE_SPACE = "state_space"  # Mamba, etc. — v0.1 unsupported
+    UNKNOWN = "unknown"
+class Confidence(StrEnum):
+    HIGH = "high"  # model_type in KNOWN_MODEL_TYPES, all fields present
+    MEDIUM = "medium"  # model_type unknown but architectures[] or config partial
+    LOW = "low"  # fallback path, config.json missing or malformed
# Closed set of attention variant tags stored on AttentionTraits.variant.
AttentionVariant = Literal[
    "MHA",
    "GQA",
    "MQA",
    "MLA",
    "NSA",
    "CSA_HCA",
]
@dataclass(frozen=True)
class AttentionTraits:
    """Immutable description of the attention layer shape.

    The four leading fields are required; the remaining ones are only set for
    the variants that use them and default to None.
    """

    variant: AttentionVariant
    num_heads: int
    num_kv_heads: int
    head_dim: int
    # Low-rank projection ranks used by MLA (DeepSeek V2+).
    q_lora_rank: int | None = None
    kv_lora_rank: int | None = None
    # Per-layer compress ratios for CSA+HCA sparse attention (DeepSeek V4).
    compress_ratios: tuple[int, ...] | None = None
    # Top-k setting for NSA sparse attention (DeepSeek V3.2).
    nsa_topk: int | None = None
@dataclass(frozen=True)
class MoETraits:
    """Immutable shape of a mixture-of-experts block.

    A profile whose `moe` is None is a dense model.
    """

    num_routed_experts: int
    num_shared_experts: int
    num_experts_per_tok: int
    moe_intermediate_size: int
@dataclass(frozen=True)
class PositionTraits:
    """Immutable positional-encoding settings (RoPE, YaRN, AliBi or none).

    Every field is optional; an empty instance means plain RoPE with no
    recorded parameters.
    """

    rope_type: Literal["rope", "yarn", "alibi", "none"] = "rope"
    rope_theta: float | None = None
    rope_scaling_factor: float | None = None
    max_position_embeddings: int | None = None
@dataclass(frozen=True)
class ArchitectureProfile:
    """Immutable architectural snapshot of one model.

    Carries the identifying fields from config.json, the basic shape, a
    confidence tag, and a set of optional, independently populated traits.
    """

    # Identity, as read from config.json.
    model_type: str  # `model_type`, lowercased
    architectures: tuple[str, ...]  # `architectures[]`
    family: Family
    # Basic shape.
    num_hidden_layers: int
    hidden_size: int
    vocab_size: int
    confidence: Confidence
    # Composable traits; any of these may be absent.
    attention: AttentionTraits | None = None
    moe: MoETraits | None = None
    position: PositionTraits | None = None
    sliding_window: int | None = None  # None means no window
    # Holding area for config values that have no dedicated trait yet.
    auxiliary: dict[str, object] = field(default_factory=dict)

    @property
    def is_moe(self) -> bool:
        """True when MoE traits are attached to the profile."""
        return self.moe is not None

    @property
    def is_sparse_attention(self) -> bool:
        """True when the attention variant is NSA or CSA_HCA."""
        traits = self.attention
        return traits is not None and traits.variant in ("NSA", "CSA_HCA")

src/llm_cal/architecture/traits.py ADDED Viewed

	@@ -0,0 +1,150 @@

+"""Independent trait sub-detectors.
+Each function inspects config.json and returns a trait dataclass (or None).
+They co-exist: a MoE+MLA+CSA_HCA model matches all three.
+Dispatch order inside `detect_attention()` is critical because some keys are
+ambiguous (e.g. num_kv_heads < num_heads can be GQA OR a side-effect of MLA
+where there's a single compressed KV head).
+"""
+from __future__ import annotations
+from typing import Any
+from llm_cal.architecture.profile import (
+    AttentionTraits,
+    MoETraits,
+    PositionTraits,
+)
def detect_moe(config: dict[str, Any]) -> MoETraits | None:
    """Detect mixture-of-experts traits from a parsed config.json.

    A truthy value under any of `n_routed_experts`, `num_local_experts` or
    `num_experts` marks the model as MoE.

    Optional fields are read with `or` fallbacks rather than `.get(key,
    default)`, because a key that is present with a null value would
    otherwise reach `int(None)` and raise.

    Args:
        config: Parsed config.json contents.

    Returns:
        MoETraits for an MoE model, or None for a dense one. Missing or null
        optional fields default to 0 shared experts, 1 expert per token and
        an intermediate size of 0.
    """
    routed = (
        config.get("n_routed_experts")
        or config.get("num_local_experts")
        or config.get("num_experts")
    )
    if not routed:
        return None
    experts_per_tok = (
        config.get("num_experts_per_tok") or config.get("num_experts_per_token") or 1
    )
    # Fall back to the dense `intermediate_size` when no MoE-specific width exists.
    expert_width = (
        config.get("moe_intermediate_size") or config.get("intermediate_size") or 0
    )
    return MoETraits(
        num_routed_experts=int(routed),
        num_shared_experts=int(config.get("n_shared_experts") or 0),
        num_experts_per_tok=int(experts_per_tok),
        moe_intermediate_size=int(expert_width),
    )
def detect_attention(config: dict[str, Any]) -> AttentionTraits:
    """Classify the attention variant and capture its shape — order-sensitive.

    Priority (first match decides the variant; shape fields are always set):
      1. CSA+HCA: `compress_ratios` is a list whose length is an accepted one
      2. NSA: `nsa_config` / `sparse_attention_cfg` key present
      3. MLA: `q_lora_rank` or `kv_lora_rank` truthy
      4. GQA/MQA: num_kv_heads < num_heads (MQA when exactly one KV head)
      5. MHA: default

    Numeric fields are read with `or` fallbacks so a key that is present but
    null (or 0 for the head count) falls back to its default instead of
    raising from `int(None)` or dividing by zero.

    Args:
        config: Parsed config.json contents.

    Returns:
        AttentionTraits with the detected variant and shape.
    """
    num_heads = int(config.get("num_attention_heads") or 1)
    # A missing or null KV head count means one KV head per query head.
    num_kv_heads = int(config.get("num_key_value_heads") or num_heads)
    hidden_size = int(config.get("hidden_size") or 0)
    head_dim = int(config.get("head_dim") or (hidden_size // num_heads) or 1)
    num_layers = int(config.get("num_hidden_layers") or 0)

    q_lora = config.get("q_lora_rank")
    kv_lora = config.get("kv_lora_rank")
    q_lora_rank = int(q_lora) if q_lora else None
    kv_lora_rank = int(kv_lora) if kv_lora else None

    # CSA+HCA: the length check guards against future variants that reuse the
    # key name with different semantics. Accepted lengths:
    #   - num_hidden_layers
    #   - num_hidden_layers + num_nextn_predict_layers (DeepSeek MTP: one extra
    #     ratio for the next-token prediction head)
    compress_ratios = config.get("compress_ratios")
    nextn = int(config.get("num_nextn_predict_layers") or 0)
    if (
        isinstance(compress_ratios, list)
        and num_layers > 0
        and len(compress_ratios) in {num_layers, num_layers + nextn}
    ):
        return AttentionTraits(
            variant="CSA_HCA",
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_dim=head_dim,
            q_lora_rank=q_lora_rank,
            kv_lora_rank=kv_lora_rank,
            compress_ratios=tuple(compress_ratios),
        )

    if "nsa_config" in config or "sparse_attention_cfg" in config:
        nsa_cfg = config.get("nsa_config") or config.get("sparse_attention_cfg", {})
        nsa_topk = None
        if isinstance(nsa_cfg, dict):
            nsa_topk = nsa_cfg.get("topk") or nsa_cfg.get("index_topk")
        return AttentionTraits(
            variant="NSA",
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_dim=head_dim,
            nsa_topk=int(nsa_topk) if nsa_topk else None,
        )

    if q_lora or kv_lora:
        return AttentionTraits(
            variant="MLA",
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_dim=head_dim,
            q_lora_rank=q_lora_rank,
            kv_lora_rank=kv_lora_rank,
        )

    if num_kv_heads < num_heads:
        variant = "MQA" if num_kv_heads == 1 else "GQA"
        return AttentionTraits(
            variant=variant,  # type: ignore[arg-type]
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_dim=head_dim,
        )

    return AttentionTraits(
        variant="MHA",
        num_heads=num_heads,
        num_kv_heads=num_kv_heads,
        head_dim=head_dim,
    )
def detect_position(config: dict[str, Any]) -> PositionTraits:
    """Extract positional-encoding settings from a parsed config.json.

    The encoding kind comes from `rope_scaling["type"]` or
    `rope_scaling["rope_type"]`, lower-cased; anything other than "rope",
    "yarn", "alibi" or "none" is reported as "rope". Numeric fields are
    converted when truthy and left as None otherwise.
    """
    scaling = config.get("rope_scaling") or {}
    declared = scaling.get("type") or scaling.get("rope_type") or "rope"
    kind = declared.lower()
    if kind not in ("rope", "yarn", "alibi", "none"):
        kind = "rope"

    theta = config.get("rope_theta")
    factor = scaling.get("factor")
    max_positions = config.get("max_position_embeddings")

    return PositionTraits(
        rope_type=kind,  # type: ignore[arg-type]
        rope_theta=float(theta) if theta else None,
        rope_scaling_factor=float(factor) if factor else None,
        max_position_embeddings=int(max_positions) if max_positions else None,
    )
def detect_sliding_window(config: dict[str, Any]) -> int | None:
    """Return the sliding-window size if the model uses one, else None.

    A config may carry a `sliding_window` value while switching the feature
    off with `use_sliding_window: false` (Qwen2-style configs do this). That
    flag is honoured so the window is not reported for models that never
    apply it. Missing, null, zero or negative sizes also mean "no window".

    Args:
        config: Parsed config.json contents.

    Returns:
        The positive window size in tokens, or None.
    """
    # Only an explicit False disables the window; an absent flag does not.
    if config.get("use_sliding_window") is False:
        return None
    raw = config.get("sliding_window")
    if not raw:
        return None
    window = int(raw)
    return window if window > 0 else None

src/llm_cal/benchmark/__init__.py ADDED Viewed

File without changes

src/llm_cal/benchmark/dataset.yaml ADDED Viewed

	@@ -0,0 +1,203 @@

+# Reference benchmark dataset — curated anchor points for validating llm-cal
+# output against publicly-known values.
+#
+# This is NOT a synthetic benchmark. Each entry cites where the expected
+# values came from — HF API, model card text, vLLM/SGLang recipe, or
+# hand computation in the design doc. If you add an entry, cite sources.
+#
+# The runner (`llm-cal benchmark`) fetches each model's live config and
+# compares the tool's output against these expectations. Failures mean
+# either the tool drifted or the reference data is stale.
+schema_version: 1
+entries:
+  # ------------------------------------------------------------
+  # Signature case — DeepSeek-V4-Flash. Every claim here is the
+  # reason this tool exists (vs gpu_poor's naive FP8 assumption).
+  # ------------------------------------------------------------
+  - name: "DeepSeek-V4-Flash on 8x H800 (tool's reference case)"
+    model_id: deepseek-ai/DeepSeek-V4-Flash
+    gpu: H800
+    engine: vllm
+    expectations:
+      - field: attention_variant
+        expected: CSA_HCA
+        source: "config.json compress_ratios length=44 matches n_layers+n_mtp"
+      - field: quantization
+        expected: FP4_FP8_MIXED
+        source: "HF model card: 'FP4 + FP8 Mixed: MoE experts FP4, others FP8'"
+      - field: weight_bytes
+        expected_min: 158_000_000_000
+        expected_max: 162_000_000_000
+        source: "HF siblings API (46x ~3.57 GB safetensors shards ≈ 160 GB)"
+      - field: fleet_prod_gpus
+        expected: 8
+        source: "Design doc hand computation: 8x H800 for prod-scale concurrency"
+      - field: is_moe
+        expected: true
+        source: "config.json n_routed_experts=256"
+  # ------------------------------------------------------------
+  # Dense GQA — Qwen2.5-72B. Validates:
+  #  - dense (no MoE) detection
+  #  - BF16/FP16 quantization path
+  #  - GQA KV sharding math (critical for Llama-family models)
+  # ------------------------------------------------------------
+  - name: "Qwen2.5-72B on 8x H100 (GQA reference)"
+    model_id: Qwen/Qwen2.5-72B-Instruct
+    gpu: H100
+    engine: vllm
+    expectations:
+      - field: attention_variant
+        expected: GQA
+        source: "config.json num_kv_heads=8 < num_attention_heads=64"
+      - field: quantization
+        expected: FP16
+        source: "config.json torch_dtype=bfloat16, no quantization_config"
+      - field: weight_bytes
+        expected_min: 140_000_000_000
+        expected_max: 150_000_000_000
+        source: "HF siblings API — 72.7B params × 2 bytes ≈ 145 GB"
+      - field: is_moe
+        expected: false
+        source: "config.json has no n_routed_experts / num_local_experts"
+      - field: fleet_prod_gpus_at_most
+        expected: 8
+        source: "Weights fit on 8x H100 (145 GB / 8 ≈ 18 GB per GPU)"
+  # ------------------------------------------------------------
+  # DeepSeek-V3 (classic MoE + MLA, not V3.2's NSA) — validates MLA detection
+  # ------------------------------------------------------------
+  - name: "DeepSeek-V3 on H800 (MoE+MLA, no sparse attention)"
+    model_id: deepseek-ai/DeepSeek-V3
+    gpu: H800
+    engine: vllm
+    expectations:
+      - field: attention_variant
+        expected: MLA
+        source: "config.json q_lora_rank=1536, no compress_ratios or nsa_config"
+      - field: is_moe
+        expected: true
+        source: "config.json n_routed_experts=256"
+      - field: quantization
+        expected: FP8
+        source: "config.json quantization_config.quant_method=fp8"
+      - field: weight_bytes
+        expected_min: 680_000_000_000
+        expected_max: 700_000_000_000
+        source: "HF siblings API — 671B params × 1 byte (FP8) ≈ 670 GB"
+  # ------------------------------------------------------------
+  # Mixtral 8x7B — standard sparse MoE with GQA attention (no MLA)
+  # ------------------------------------------------------------
+  - name: "Mixtral 8x7B on 4x H100 (standard MoE, no MLA)"
+    model_id: mistralai/Mixtral-8x7B-v0.1
+    gpu: H100
+    engine: vllm
+    expectations:
+      - field: attention_variant
+        expected: GQA
+        source: "config.json num_kv_heads=8 < num_attention_heads=32"
+      - field: is_moe
+        expected: true
+        source: "config.json num_local_experts=8"
+      - field: quantization
+        expected: FP16
+        source: "config.json torch_dtype=bfloat16, no quantization_config"
+      - field: weight_bytes
+        expected_min: 90_000_000_000
+        expected_max: 100_000_000_000
+        source: "HF siblings API — 46.7B total params × 2 bytes ≈ 93 GB"
+  # ------------------------------------------------------------
+  # DeepSeek-V3.2 — MLA structurally (NSA at runtime). Validates:
+  #  - model_type=deepseek_v32 is recognized
+  #  - FP8 quantization (inherited from V3)
+  #  - Tool honestly reports MLA because config.json exposes only MLA
+  #    keys; runtime NSA behavior is NOT in config. Future detection
+  #    improvement could override based on model_type.
+  # ------------------------------------------------------------
+  - name: "DeepSeek-V3.2 on H800 (MLA config; NSA runtime)"
+    model_id: deepseek-ai/DeepSeek-V3.2
+    gpu: H800
+    engine: vllm
+    expectations:
+      - field: attention_variant
+        expected: MLA
+        source: >-
+          config.json q_lora_rank=1536, no nsa_config key — detector
+          correctly reports MLA. V3.2's NSA sparse behavior is a runtime
+          feature selected by vllm --attention-backend nsa, NOT encoded
+          in config.json keys. TODO: detector could upgrade to NSA when
+          model_type matches known NSA models.
+      - field: is_moe
+        expected: true
+        source: "config.json n_routed_experts=256"
+      - field: quantization
+        expected: FP8
+        source: "config.json quantization_config.quant_method=fp8"
+  # ------------------------------------------------------------
+  # Qwen3-30B-A3B — validates qwen3_moe model_type + GQA+MoE combo
+  # ------------------------------------------------------------
+  - name: "Qwen3-30B-A3B on H100 (Qwen3 MoE, GQA)"
+    model_id: Qwen/Qwen3-30B-A3B
+    gpu: H100
+    engine: vllm
+    expectations:
+      - field: attention_variant
+        expected: GQA
+        source: "config.json num_kv_heads=4 < num_attention_heads=32"
+      - field: is_moe
+        expected: true
+        source: "config.json num_local_experts or similar MoE key present"
+      - field: quantization
+        expected: FP16
+        source: "config.json torch_dtype=bfloat16"
+      - field: weight_bytes
+        expected_min: 58_000_000_000
+        expected_max: 65_000_000_000
+        source: "HF siblings API — 30.5B total params × 2 bytes ≈ 61 GB"
+  # ------------------------------------------------------------
+  # Qwen2.5-7B — small-model sanity + qwen2 model_type
+  # ------------------------------------------------------------
+  - name: "Qwen2.5-7B on H100 (small dense, sanity)"
+    model_id: Qwen/Qwen2.5-7B-Instruct
+    gpu: H100
+    engine: vllm
+    expectations:
+      - field: attention_variant
+        expected: GQA
+        source: "config.json num_kv_heads=4 < num_attention_heads=28"
+      - field: is_moe
+        expected: false
+        source: "config.json has no MoE keys"
+      - field: quantization
+        expected: FP16
+        source: "config.json torch_dtype=bfloat16"
+      - field: weight_bytes
+        expected_min: 14_000_000_000
+        expected_max: 16_000_000_000
+        source: "HF siblings API — 7.6B params × 2 bytes ≈ 15.2 GB"
+  # ------------------------------------------------------------
+  # Phi-4 — validates phi3 model_type + dense 14B
+  # ------------------------------------------------------------
+  - name: "Phi-4 on L40S (phi3 architecture, 14B dense)"
+    model_id: microsoft/Phi-4
+    gpu: L40S
+    engine: vllm
+    expectations:
+      - field: attention_variant
+        expected: GQA
+        source: "config.json num_kv_heads=10 < num_attention_heads=40"
+      - field: is_moe
+        expected: false
+        source: "config.json has no MoE keys"
+      - field: quantization
+        expected: FP16
+        source: "config.json torch_dtype=bfloat16"
+      - field: weight_bytes
+        expected_min: 28_000_000_000
+        expected_max: 31_000_000_000
+        source: "HF siblings API — 14.7B params × 2 bytes ≈ 29.3 GB"

src/llm_cal/benchmark/runner.py ADDED Viewed

	@@ -0,0 +1,232 @@

+"""Benchmark runner — validate llm-cal's output against curated references.
+For each entry in dataset.yaml, run the evaluator against the model, then
+compare each `expectations[]` field with the predicted value. Report a
+table of pass/fail per check, plus a summary.
+This is NOT a synthetic benchmark. Every expected value cites a source
+(HF API, model card text, vLLM recipe, hand computation) so users can
+audit.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from functools import lru_cache
+from importlib.resources import files
+from pathlib import Path
+from typing import Literal
+from pydantic import BaseModel, Field
+from rich.console import Console
+from rich.table import Table
+from llm_cal.common.yaml_loader import load_yaml
+from llm_cal.core.evaluator import EvaluationReport, Evaluator
# Outcome of a single benchmark check.
Status = Literal[
    "PASS",
    "FAIL",
    "SKIP",
]
class Expectation(BaseModel):
    """One expected value for a benchmark entry, with its citation."""

    # Name of the check to run.
    field: str
    # Either an exact `expected` value or an `expected_min`/`expected_max`
    # range is used, depending on `field`.
    expected: str | int | bool | None = None
    expected_min: int | None = None
    expected_max: int | None = None
    # Where the expected value came from.
    source: str
class BenchmarkEntry(BaseModel):
    """A model/GPU/engine combination and the expectations checked against it."""

    name: str
    model_id: str
    gpu: str
    engine: str = "vllm"
    expectations: list[Expectation] = Field(default_factory=list)
class BenchmarkDataset(BaseModel):
    """Top-level shape of the benchmark YAML: a schema version plus entries."""

    schema_version: int
    entries: list[BenchmarkEntry]
@dataclass(frozen=True)
class CheckResult:
    """Immutable outcome of comparing one expectation with the tool's output."""

    entry_name: str
    field: str
    status: Status
    # Both sides of the comparison, already rendered for display.
    predicted: str
    expected: str
    source: str
    # Optional extra detail about the outcome.
    note: str | None = None
def _default_dataset_path() -> Path:
    """Return the path of the `dataset.yaml` packaged in `llm_cal.benchmark`."""
    resource = files("llm_cal.benchmark").joinpath("dataset.yaml")
    return Path(str(resource))
@lru_cache(maxsize=1)
def load_dataset(path: Path | None = None) -> BenchmarkDataset:
    """Load and validate the benchmark dataset, caching the most recent call.

    Args:
        path: YAML file to read; the packaged dataset is used when omitted.
    """
    target = path or _default_dataset_path()
    return load_yaml(target, BenchmarkDataset)
def run_all(
    evaluator: Evaluator | None = None,
    dataset: BenchmarkDataset | None = None,
) -> list[CheckResult]:
    """Evaluate every dataset entry and return one flat list of check results.

    Args:
        evaluator: Evaluator to use; a fresh `Evaluator()` when omitted.
        dataset: Dataset to run; the default dataset when omitted.

    Returns:
        One CheckResult per expectation, in dataset order. If evaluating an
        entry raises, all of its expectations are reported as SKIP with the
        exception recorded in `note`, and the run continues with the next
        entry.
    """
    active_evaluator = evaluator or Evaluator()
    active_dataset = dataset or load_dataset()
    collected: list[CheckResult] = []
    for entry in active_dataset.entries:
        try:
            report = active_evaluator.evaluate(
                model_id=entry.model_id,
                gpu=entry.gpu,
                engine=entry.engine,
            )
        except Exception as e:
            # One broken model must not abort the whole benchmark run.
            failure = f"{type(e).__name__}: {e}"
            collected.extend(
                CheckResult(
                    entry_name=entry.name,
                    field=exp.field,
                    status="SKIP",
                    predicted="(evaluation failed)",
                    expected=_fmt_expected(exp),
                    source=exp.source,
                    note=failure,
                )
                for exp in entry.expectations
            )
            continue
        collected.extend(_check_one(entry.name, report, exp) for exp in entry.expectations)
    return collected
def _check_one(entry_name: str, report: EvaluationReport, exp: Expectation) -> CheckResult:
    """Compare one expectation with the report and wrap the outcome."""
    shown_value, verdict = _evaluate_field(report, exp)
    shown_expected = _fmt_expected(exp)
    return CheckResult(
        entry_name=entry_name,
        field=exp.field,
        status=verdict,
        predicted=shown_value,
        expected=shown_expected,
        source=exp.source,
    )
+def _evaluate_field(report: EvaluationReport, exp: Expectation) -> tuple[str, Status]:
+    """Return (predicted_str, PASS/FAIL/SKIP) for this field.
+    Each `field` name matches a documented check in dataset.yaml.
+    """
+    if exp.field == "attention_variant":
+        attn_actual = report.profile.attention.variant if report.profile.attention else "(none)"
+        return attn_actual, ("PASS" if attn_actual == exp.expected else "FAIL")
+    if exp.field == "quantization":
+        quant_actual = report.weight.quantization_guess.value
+        return quant_actual, ("PASS" if quant_actual == exp.expected else "FAIL")
+    if exp.field == "is_moe":
+        actual_bool = report.profile.is_moe
+        return str(actual_bool), ("PASS" if actual_bool == exp.expected else "FAIL")
+    if exp.field == "weight_bytes":
+        actual_int = report.weight.total_bytes.value
+        low = exp.expected_min or 0
+        high = exp.expected_max or (1 << 62)
+        passed = low <= actual_int <= high
+        return f"{actual_int:,}", ("PASS" if passed else "FAIL")
+    if exp.field == "fleet_prod_gpus":
+        if report.fleet is None:
+            return "(no fleet)", "SKIP"
+        prod = next((o for o in report.fleet.options if o.tier == "prod"), None)
+        if prod is None:
+            return "(no prod tier)", "SKIP"
+        passed = prod.gpu_count == exp.expected
+        return str(prod.gpu_count), ("PASS" if passed else "FAIL")
+    if exp.field == "fleet_prod_gpus_at_most":
+        if report.fleet is None:
+            return "(no fleet)", "SKIP"
+        prod = next((o for o in report.fleet.options if o.tier == "prod"), None)
+        if prod is None:
+            return "(no prod tier)", "SKIP"
+        passed = prod.gpu_count <= int(exp.expected or 0)
+        return f"{prod.gpu_count} (max {exp.expected})", ("PASS" if passed else "FAIL")
+    return "(unknown field)", "SKIP"
+def _fmt_expected(exp: Expectation) -> str:
+    if exp.expected is not None:
+        return str(exp.expected)
+    if exp.expected_min is not None or exp.expected_max is not None:
+        lo = f"{exp.expected_min:,}" if exp.expected_min is not None else "-∞"
+        hi = f"{exp.expected_max:,}" if exp.expected_max is not None else "+∞"
+        return f"[{lo}, {hi}]"
+    return "(unspecified)"
+def render_results(results: list[CheckResult], console: Console | None = None) -> None:
+    console = console or Console()
+    table = Table(
+        title="Benchmark results",
+        title_justify="left",
+        show_header=True,
+        header_style="dim",
+        box=None,
+        padding=(0, 2),
+    )
+    table.add_column("entry")
+    table.add_column("field")
+    table.add_column("predicted")
+    table.add_column("expected")
+    table.add_column("status")
+    status_styles = {
+        "PASS": "bold green",
+        "FAIL": "bold red",
+        "SKIP": "dim yellow",
+    }
+    current_entry = None
+    for r in results:
+        entry_cell = r.entry_name if r.entry_name != current_entry else ""
+        current_entry = r.entry_name
+        table.add_row(
+            entry_cell,
+            r.field,
+            r.predicted,
+            r.expected,
+            f"[{status_styles[r.status]}]{r.status}[/]",
+        )
+    console.print(table)
+    total = len(results)
+    passed = sum(1 for r in results if r.status == "PASS")
+    failed = sum(1 for r in results if r.status == "FAIL")
+    skipped = sum(1 for r in results if r.status == "SKIP")
+    summary = (
+        f"Total: {total}   "
+        f"[bold green]PASS: {passed}[/]   "
+        f"[bold red]FAIL: {failed}[/]   "
+        f"[dim yellow]SKIP: {skipped}[/]"
+    )
+    console.print(summary)
+    if failed > 0:
+        console.print(
+            "[dim]Failures show the tool's prediction diverges from a curated "
+            "source. Check the `source` column for the expected-value provenance.[/]"
+        )
+def exit_code_from(results: list[CheckResult]) -> int:
+    """0 if all PASS or only SKIP; 1 if any FAIL."""
+    return 1 if any(r.status == "FAIL" for r in results) else 0

src/llm_cal/cli.py ADDED Viewed

	@@ -0,0 +1,207 @@

+"""CLI entry point. Thin shell over `Evaluator` + rich formatter."""
+from __future__ import annotations
+import sys
+import typer
+from rich.console import Console
+from llm_cal.benchmark.runner import exit_code_from, render_results, run_all
+from llm_cal.common.i18n import detect_locale_from_env, get_locale, set_locale, t
+from llm_cal.core.evaluator import Evaluator
+from llm_cal.core.explain import build as build_explain
+from llm_cal.hardware.loader import load_database
+from llm_cal.llm_review.reviewer import run_review
+from llm_cal.model_source.base import (
+    AuthRequiredError,
+    ModelNotFoundError,
+    ModelSource,
+    SourceUnavailableError,
+)
+from llm_cal.model_source.huggingface import HuggingFaceSource
+from llm_cal.model_source.modelscope import ModelScopeSource
+from llm_cal.output.formatter import (
+    render,
+    render_explain,
+    render_gpu_list,
+    render_llm_review,
+)
+# Set locale from env first; --lang flag can override inside main()
+set_locale(detect_locale_from_env())
+# Typer application object; `main` below is registered on it as a command.
+app = typer.Typer(
+    name="llm-cal",
+    help="LLM inference hardware calculator.",
+    no_args_is_help=True,
+)
+# Shared rich consoles: `_console` for regular output, `_err` (created with
+# stderr=True) for error messages.
+_console = Console()
+_err = Console(stderr=True)
+@app.command()
+def main(
+    model_id: str | None = typer.Argument(None, help="HuggingFace or ModelScope model id"),
+    gpu: str | None = typer.Option(None, "--gpu", help="GPU type, e.g. H800, A100-80G"),
+    engine: str = typer.Option("vllm", "--engine", help="Inference engine: vllm | sglang"),
+    gpu_count: int | None = typer.Option(
+        None, "--gpu-count", help="Force GPU count (otherwise tool recommends)"
+    ),
+    context_length: int | None = typer.Option(
+        None, "--context-length", help="Context length for KV cache estimation"
+    ),
+    refresh: bool = typer.Option(False, "--refresh", help="Bypass cache and re-fetch"),
+    lang: str | None = typer.Option(
+        None,
+        "--lang",
+        help="Output language: en | zh (default auto-detects from LANG env)",
+    ),
+    list_gpus: bool = typer.Option(
+        False,
+        "--list-gpus",
+        help="List all supported GPUs and exit (no model_id needed)",
+    ),
+    benchmark: bool = typer.Option(
+        False,
+        "--benchmark",
+        help=(
+            "Run the curated benchmark dataset: compare tool output against "
+            "reference values from HF API, model cards, vLLM recipes. "
+            "Requires network. Exit 0 on all-pass, 1 if any FAIL."
+        ),
+    ),
+    input_tokens: int = typer.Option(
+        2000,
+        "--input-tokens",
+        help="Input token budget for prefill-latency estimation (default: 2000).",
+    ),
+    output_tokens: int = typer.Option(
+        512,
+        "--output-tokens",
+        help="Output token budget for total-latency math (default: 512).",
+    ),
+    target_tokens_per_sec: float = typer.Option(
+        30.0,
+        "--target-tokens-per-sec",
+        help="SLA: per-user decode tokens/second (drives L bound). Default: 30.",
+    ),
+    prefill_util: float = typer.Option(
+        0.40,
+        "--prefill-util",
+        help="Compute utilization factor for prefill (empirical, default 0.40).",
+    ),
+    decode_bw_util: float = typer.Option(
+        0.50,
+        "--decode-bw-util",
+        help="Memory-bandwidth utilization factor for decode (default 0.50).",
+    ),
+    concurrency_degradation: float = typer.Option(
+        1.0,
+        "--concurrency-degradation",
+        help=(
+            "High-concurrency throughput degradation factor (default 1.0 = "
+            "no degradation — the honest baseline). If your engine drops "
+            "to 60% efficiency under load, pass 1.67. See docs/methodology.md."
+        ),
+    ),
+    explain: bool = typer.Option(
+        False,
+        "--explain",
+        help=(
+            "Print the full derivation trace (formula, inputs, step-by-step, "
+            "source) for every non-trivial number. Feed the output to an LLM "
+            "if you want a second opinion on the math."
+        ),
+    ),
+    llm_review: bool = typer.Option(
+        False,
+        "--llm-review",
+        help=(
+            "EXPERIMENTAL: send the derivation trace to an LLM for a second "
+            "opinion. Output is tagged [llm-opinion] and never overrides the "
+            "6 primary labels. Requires env vars: LLM_CAL_REVIEWER_API_KEY "
+            "(required), LLM_CAL_REVIEWER_BASE_URL (default OpenAI), "
+            "LLM_CAL_REVIEWER_MODEL (default gpt-4o)."
+        ),
+    ),
+    source: str = typer.Option(
+        "huggingface",
+        "--source",
+        help=(
+            "Model source: huggingface (default) | modelscope. "
+            "Auth via HF_TOKEN or MODELSCOPE_API_TOKEN env var."
+        ),
+    ),
+) -> None:
+    """Evaluate a model against target hardware."""
+    if lang in ("en", "zh"):
+        set_locale(lang)  # type: ignore[arg-type]
+    # Meta commands short-circuit before requiring model_id + --gpu.
+    if list_gpus:
+        render_gpu_list(load_database(), _console)
+        return
+    if benchmark:
+        results = run_all()
+        render_results(results, _console)
+        sys.exit(exit_code_from(results))
+    if not model_id:
+        _err.print("[red]Missing argument MODEL_ID. Use --help for usage.[/red]")
+        raise typer.Exit(code=1)
+    if not gpu:
+        _err.print("[red]Missing option --gpu. Use --list-gpus to see choices.[/red]")
+        raise typer.Exit(code=1)
+    src_obj: ModelSource
+    src_lower = source.lower()
+    if src_lower in ("hf", "huggingface"):
+        src_obj = HuggingFaceSource()
+    elif src_lower in ("ms", "modelscope"):
+        src_obj = ModelScopeSource()
+    else:
+        _err.print(
+            f"[red]Unknown --source '{source}'. Use 'huggingface' or 'modelscope'.[/red]"
+        )
+        raise typer.Exit(code=1)
+    evaluator = Evaluator(source=src_obj)
+    try:
+        report = evaluator.evaluate(
+            model_id=model_id,
+            gpu=gpu,
+            engine=engine,
+            gpu_count=gpu_count,
+            context_length=context_length,
+            refresh=refresh,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            target_tokens_per_sec=target_tokens_per_sec,
+            prefill_utilization=prefill_util,
+            decode_bw_utilization=decode_bw_util,
+            concurrency_degradation=concurrency_degradation,
+        )
+    except AuthRequiredError as e:
+        _err.print(f"[bold red]{t('cli.err.auth_required')}[/bold red] {e}")
+        sys.exit(2)
+    except ModelNotFoundError as e:
+        _err.print(f"[bold red]{t('cli.err.model_not_found')}[/bold red] {e}")
+        sys.exit(3)
+    except SourceUnavailableError as e:
+        _err.print(f"[bold red]{t('cli.err.source_unavailable')}[/bold red] {e}")
+        sys.exit(4)
+    render(report, _console)
+    explain_entries = build_explain(report) if (explain or llm_review) else []
+    if explain:
+        render_explain(explain_entries, _console)
+    if llm_review:
+        # Locale at this point has been resolved by set_locale() calls above.
+        result = run_review(explain_entries, locale=get_locale())
+        render_llm_review(result, _console)
+# Run the Typer app when this module is executed as a script.
+if __name__ == "__main__":
+    app()

src/llm_cal/command_generator/__init__.py ADDED Viewed

File without changes

src/llm_cal/command_generator/sglang.py ADDED Viewed

	@@ -0,0 +1,50 @@

+"""Generate a ready-to-copy SGLang launch command."""
+from __future__ import annotations
+from llm_cal.architecture.profile import ArchitectureProfile
+from llm_cal.engine_compat.loader import EngineCompatEntry
+def generate_sglang_command(
+    model_id: str,
+    profile: ArchitectureProfile,
+    tensor_parallel_size: int,
+    entry: EngineCompatEntry | None,
+    max_model_len: int | None = None,
+) -> str:
+    """Generate a multi-line `python -m sglang.launch_server ...` command string."""
+    lines: list[str] = [
+        "python -m sglang.launch_server",
+        f"  --model-path {model_id}",
+        f"  --tp {tensor_parallel_size}",
+    ]
+    effective_max = max_model_len
+    if effective_max is None and profile.position is not None:
+        effective_max = profile.position.max_position_embeddings
+    if effective_max:
+        lines.append(f"  --context-length {effective_max}")
+    if _needs_trust_remote_code(profile.model_type):
+        lines.append("  --trust-remote-code")
+    lines.append("  --mem-fraction-static 0.9")
+    if entry is not None:
+        for flag in entry.required_flags:
+            lines.append("  " + _render_flag(flag.flag, flag.value))
+        for flag in entry.optional_flags:
+            lines.append("  " + _render_flag(flag.flag, flag.value))
+    return " \\\n".join(lines)
+def _render_flag(flag: str, value: str | None) -> str:
+    if value is None:
+        return flag
+    return f"{flag} {value}"
+def _needs_trust_remote_code(model_type: str) -> bool:
+    return model_type.startswith(("deepseek", "qwen2_moe", "qwen3_moe", "mixtral"))

src/llm_cal/command_generator/vllm.py ADDED Viewed

	@@ -0,0 +1,55 @@

+"""Generate a ready-to-copy vllm serve command."""
+from __future__ import annotations
+from llm_cal.architecture.profile import ArchitectureProfile
+from llm_cal.engine_compat.loader import EngineCompatEntry
+def generate_vllm_command(
+    model_id: str,
+    profile: ArchitectureProfile,
+    tensor_parallel_size: int,
+    entry: EngineCompatEntry | None,
+    max_model_len: int | None = None,
+) -> str:
+    """Generate a multi-line `vllm serve ...` command string.
+    If `entry` is given, appends required_flags and optional_flags verbatim.
+    """
+    lines: list[str] = [
+        "vllm serve " + model_id,
+        f"  --tensor-parallel-size {tensor_parallel_size}",
+    ]
+    # Pick max-model-len from profile if caller didn't override.
+    effective_max = max_model_len
+    if effective_max is None and profile.position is not None:
+        effective_max = profile.position.max_position_embeddings
+    if effective_max:
+        lines.append(f"  --max-model-len {effective_max}")
+    # DeepSeek and friends need trust-remote-code. Heuristic: non-trivial model_type.
+    if _needs_trust_remote_code(profile.model_type):
+        lines.append("  --trust-remote-code")
+    lines.append("  --gpu-memory-utilization 0.9")
+    if entry is not None:
+        for flag in entry.required_flags:
+            lines.append("  " + _render_flag(flag.flag, flag.value))
+        for flag in entry.optional_flags:
+            lines.append("  " + _render_flag(flag.flag, flag.value))
+    return " \\\n".join(lines)
+def _render_flag(flag: str, value: str | None) -> str:
+    if value is None:
+        return flag
+    return f"{flag} {value}"
+def _needs_trust_remote_code(model_type: str) -> bool:
+    """Models that ship custom modeling code in the repo."""
+    return model_type.startswith(("deepseek", "qwen2_moe", "qwen3_moe", "mixtral"))

src/llm_cal/common/__init__.py ADDED Viewed

File without changes

src/llm_cal/common/i18n.py ADDED Viewed

	@@ -0,0 +1,421 @@

+"""Minimal i18n layer. No gettext, no external deps.
+Supports `en` and `zh`. Defaults to `en` but auto-detects `zh` from the
+LC_ALL / LC_MESSAGES / LANG environment variables (values starting with
+`zh`, which covers zh_CN, zh_TW, zh_HK, etc.).
+Usage:
+    from llm_cal.common.i18n import t, set_locale
+    set_locale("zh")
+    print(t("kv.label"))   # "标签"
+"""
+from __future__ import annotations
+import os
+from typing import Literal
+# The locale codes the message table provides strings for.
+Locale = Literal["en", "zh"]
+# Module-wide active locale; changed via set_locale() and read by t().
+_current_locale: Locale = "en"
+_MESSAGES: dict[str, dict[Locale, str]] = {
+    # CLI help text
+    "cli.help": {
+        "en": "LLM inference hardware calculator.",
+        "zh": "大模型推理硬件计算器。",
+    },
+    "cli.arg.model_id": {
+        "en": "HuggingFace or ModelScope model id",
+        "zh": "HuggingFace 或 ModelScope 的 model id",
+    },
+    "cli.opt.gpu": {
+        "en": "GPU type, e.g. H800, A100-80G",
+        "zh": "GPU 型号，例如 H800、A100-80G",
+    },
+    "cli.opt.engine": {
+        "en": "Inference engine: vllm | sglang",
+        "zh": "推理引擎：vllm | sglang",
+    },
+    "cli.opt.gpu_count": {
+        "en": "Force GPU count (otherwise tool recommends min/dev/prod)",
+        "zh": "强制指定 GPU 张数（默认由工具推荐 min/dev/prod 三档）",
+    },
+    "cli.opt.context_length": {
+        "en": "Context length for KV cache estimation",
+        "zh": "用于 KV cache 估算的上下文长度",
+    },
+    "cli.opt.refresh": {
+        "en": "Bypass cache and re-fetch",
+        "zh": "绕过缓存重新拉取",
+    },
+    "cli.opt.lang": {
+        "en": "Output language: en | zh",
+        "zh": "输出语言：en | zh",
+    },
+    "cli.err.auth_required": {
+        "en": "Authentication required:",
+        "zh": "需要认证：",
+    },
+    "cli.err.model_not_found": {
+        "en": "Model not found:",
+        "zh": "模型未找到：",
+    },
+    "cli.err.source_unavailable": {
+        "en": "Source unavailable:",
+        "zh": "数据源不可用：",
+    },
+    # Panel / section titles
+    "panel.via": {"en": "via", "zh": "来源"},
+    "section.architecture": {"en": "Architecture", "zh": "架构"},
+    "section.weights": {"en": "Weights", "zh": "权重"},
+    "section.kv_cache": {
+        "en": "KV cache per request (BF16/FP16)",
+        "zh": "单请求 KV Cache（BF16/FP16）",
+    },
+    "section.reconciliation": {
+        "en": "Quantization reconciliation (observed vs predicted per scheme)",
+        "zh": "量化方案对账（观测值 vs 各方案预测值）",
+    },
+    "section.engine_compat": {
+        "en": "Engine compatibility",
+        "zh": "推理引擎兼容性",
+    },
+    "section.hardware": {"en": "Target hardware", "zh": "目标硬件"},
+    "section.labels": {"en": "labels:", "zh": "标签："},
+    # Architecture row labels
+    "arch.model_type": {"en": "model_type", "zh": "模型类型"},
+    "arch.family": {"en": "family", "zh": "架构族"},
+    "arch.confidence": {"en": "confidence", "zh": "识别置信度"},
+    "arch.layers": {"en": "layers", "zh": "层数"},
+    "arch.hidden_size": {"en": "hidden_size", "zh": "隐藏维度"},
+    "arch.vocab_size": {"en": "vocab_size", "zh": "词表大小"},
+    "arch.attention": {"en": "attention", "zh": "注意力机制"},
+    "arch.compress_ratios": {"en": "compress_ratios", "zh": "压缩比数组"},
+    "arch.moe": {"en": "moe", "zh": "MoE"},
+    "arch.sliding_window": {"en": "sliding_window", "zh": "滑动窗口"},
+    "arch.max_position": {
+        "en": "max_position_embeddings",
+        "zh": "最大上下文长度",
+    },
+    "arch.none": {"en": "(none)", "zh": "（无）"},
+    "arch.compress_ratios_summary": {
+        "en": "len={n}, dense_layers={dense}",
+        "zh": "长度={n}，dense 层数={dense}",
+    },
+    "arch.moe_summary": {
+        "en": "{routed} routed + {shared} shared, top-{topk}",
+        "zh": "{routed} 个 routed + {shared} 个 shared，top-{topk}",
+    },
+    "arch.attn_summary": {
+        "en": "{variant} (heads={heads}, kv_heads={kv_heads}, head_dim={head_dim})",
+        "zh": "{variant}（heads={heads}，kv_heads={kv_heads}，head_dim={head_dim}）",
+    },
+    "arch.unsupported_state_space": {
+        "en": "State-space models are not supported in v0.1 (planned for v0.3+).",
+        "zh": "状态空间模型（Mamba 类）在 v0.1 暂不支持，计划在 v0.3+ 加入。",
+    },
+    # Weights rows
+    "weights.safetensors_bytes": {
+        "en": "safetensors bytes",
+        "zh": "safetensors 总字节",
+    },
+    "weights.params_estimated": {
+        "en": "estimated total params",
+        "zh": "参数量（估算）",
+    },
+    "weights.bits_per_param": {"en": "bits/param", "zh": "每参数位数"},
+    "weights.quant_guess": {"en": "quantization guess", "zh": "量化方案推断"},
+    # Reconciliation
+    "recon.scheme": {"en": "scheme", "zh": "量化方案"},
+    "recon.predicted": {"en": "predicted bytes", "zh": "预测字节"},
+    "recon.delta": {"en": "delta", "zh": "差值"},
+    "recon.error_pct": {"en": "error %", "zh": "误差 %"},
+    "recon.over": {"en": "over", "zh": "偏高"},
+    "recon.under": {"en": "under", "zh": "偏低"},
+    "recon.best": {"en": "best match:", "zh": "最佳匹配："},
+    # KV cache
+    "kv.context": {"en": "context", "zh": "上下文"},
+    "kv.kv_cache": {"en": "KV cache", "zh": "KV Cache"},
+    "kv.label": {"en": "label", "zh": "标签"},
+    "kv.tokens": {"en": "tokens", "zh": "tokens"},
+    # Engine compatibility
+    "engine.version_spec": {"en": "version", "zh": "版本要求"},
+    "engine.support": {"en": "support", "zh": "支持程度"},
+    "engine.verification": {"en": "verification", "zh": "验证等级"},
+    "engine.required_flags": {"en": "required flags", "zh": "必需参数"},
+    "engine.optional_flags": {"en": "optional flags", "zh": "可选参数"},
+    "engine.caveats": {"en": "caveats", "zh": "注意事项"},
+    "engine.sources": {"en": "sources", "zh": "来源"},
+    "engine.no_match": {
+        "en": "No compatibility entry for this model + engine in v0.1 matrix.",
+        "zh": "v0.1 兼容矩阵中暂无此模型 + 引擎的条目。",
+    },
+    # Hardware
+    "hw.memory": {"en": "memory", "zh": "显存"},
+    "hw.nvlink_bandwidth": {"en": "NVLink bandwidth", "zh": "NVLink 带宽"},
+    "hw.fp16_tflops": {"en": "FP16 TFLOPS", "zh": "FP16 算力"},
+    "hw.fp8_support": {"en": "FP8 support", "zh": "FP8 支持"},
+    "hw.fp4_support": {"en": "FP4 support", "zh": "FP4 支持"},
+    "hw.notes": {"en": "notes", "zh": "备注"},
+    "hw.spec_source": {"en": "spec source", "zh": "规格来源"},
+    # GPU list subcommand
+    "gpus.list.title": {
+        "en": "Supported GPUs",
+        "zh": "支持的 GPU",
+    },
+    "gpus.col.id": {"en": "id", "zh": "型号"},
+    "gpus.col.memory": {"en": "memory", "zh": "显存"},
+    "gpus.col.nvlink": {"en": "NVLink / fabric", "zh": "互联带宽"},
+    "gpus.col.fp16": {"en": "FP16 TFLOPS", "zh": "FP16"},
+    "gpus.col.fp8": {"en": "FP8", "zh": "FP8"},
+    "gpus.col.fp4": {"en": "FP4", "zh": "FP4"},
+    "gpus.col.aliases": {"en": "aliases", "zh": "别名"},
+    "gpus.total": {
+        "en": "Total: {count} GPUs (pass any id or alias to --gpu)",
+        "zh": "共 {count} 款（--gpu 后面填 ID 或别名均可）",
+    },
+    "hw.unknown": {
+        "en": "Unknown GPU '{gpu}'. Known: {known}",
+        "zh": "未知 GPU '{gpu}'。已知型号：{known}",
+    },
+    "hw.bool_yes": {"en": "yes", "zh": "是"},
+    "hw.bool_no": {"en": "no", "zh": "否"},
+    # Labels — localized display names. Enum identity stays English.
+    "label.verified": {"en": "verified", "zh": "已验证"},
+    "label.inferred": {"en": "inferred", "zh": "推断"},
+    "label.estimated": {"en": "estimated", "zh": "估算"},
+    "label.cited": {"en": "cited", "zh": "引用"},
+    "label.unverified": {"en": "unverified", "zh": "未经验证"},
+    "label.unknown": {"en": "unknown", "zh": "未知"},
+    "label.llm-opinion": {"en": "llm-opinion", "zh": "LLM 观点"},
+    # Source attribution
+    "source.pr": {"en": "PR", "zh": "PR"},
+    "source.release_notes": {"en": "release notes", "zh": "release note"},
+    "source.announcement": {"en": "announcement", "zh": "官方公告"},
+    "source.tested": {"en": "tested", "zh": "实测"},
+    "source.captured_on": {"en": "captured on", "zh": "采集于"},
+    # Fleet planner
+    "section.fleet": {
+        "en": "Recommended fleet",
+        "zh": "推荐 GPU 张数",
+    },
+    "fleet.col.tier": {"en": "tier", "zh": "档位"},
+    "fleet.col.gpus": {"en": "GPUs", "zh": "GPU 数"},
+    "fleet.col.weight_per_gpu": {
+        "en": "weight / GPU",
+        "zh": "单卡权重",
+    },
+    "fleet.col.headroom_per_gpu": {
+        "en": "headroom / GPU",
+        "zh": "单卡余量",
+    },
+    "fleet.col.fit": {"en": "fit", "zh": "评估"},
+    "fleet.col.concurrent_at_ctx": {
+        "en": "concurrent @ {ctx}",
+        "zh": "并发 @ {ctx}",
+    },
+    "fleet.tier.min": {"en": "min", "zh": "最小"},
+    "fleet.tier.dev": {"en": "dev", "zh": "开发"},
+    "fleet.tier.prod": {"en": "prod", "zh": "生产"},
+    "fleet.best_marker": {
+        "en": "= recommended",
+        "zh": "= 推荐档位",
+    },
+    "fleet.constraint": {"en": "constraint:", "zh": "约束："},
+    "fleet.forced": {
+        "en": "Forced GPU count (--gpu-count was set)",
+        "zh": "已强制指定 GPU 张数（--gpu-count）",
+    },
+    "fleet.gpu_spec_unknown": {
+        "en": "Fleet planning skipped — GPU spec unknown.",
+        "zh": "GPU 规格未知，跳过 fleet 规划。",
+    },
+    # Command generator
+    "section.command": {
+        "en": "Generated command",
+        "zh": "生成的启动命令",
+    },
+    "command.tier_note": {
+        "en": "tier: {tier} ({gpus} GPUs)",
+        "zh": "档位：{tier}（{gpus} 张）",
+    },
+    # Performance section
+    "section.performance": {
+        "en": "Performance analysis",
+        "zh": "性能分析",
+    },
+    "perf.assumptions_note": {
+        "en": (
+            "Assumes input={input_tokens} tokens, output={output_tokens} tokens, "
+            "target {target_tps} tok/s per user. "
+            "Utilization: prefill={prefill_util:.0%} / decode_bw={decode_util:.0%} "
+            "/ concurrency_degradation={degradation:.2f}x. "
+            "All numbers are [estimated] — see docs/methodology.md for formula sources "
+            "and override via --prefill-util / --decode-bw-util / --concurrency-degradation."
+        ),
+        "zh": (
+            "假设输入 {input_tokens} tokens、输出 {output_tokens} tokens、"
+            "每用户目标 {target_tps} tok/s。"
+            "利用率：prefill={prefill_util:.0%} / decode_bw={decode_util:.0%} "
+            "/ 并发退化={degradation:.2f}x。"
+            "所有数字都是 [估算]——公式来源见 docs/methodology.md，"
+            "可通过 --prefill-util / --decode-bw-util / --concurrency-degradation 覆盖。"
+        ),
+    },
+    "perf.prefill_latency": {
+        "en": "Prefill latency (single request)",
+        "zh": "Prefill 延迟（单请求）",
+    },
+    "perf.decode_throughput_cluster": {
+        "en": "Decode throughput (cluster)",
+        "zh": "Decode 吞吐（集群）",
+    },
+    "perf.decode_throughput_per_gpu": {
+        "en": "Decode throughput (per GPU)",
+        "zh": "Decode 吞吐（单卡）",
+    },
+    "perf.decode_moe_active_optimistic": {
+        "en": "Decode throughput (MoE active-only, optimistic)",
+        "zh": "Decode 吞吐（MoE 仅激活专家，乐观估算）",
+    },
+    "perf.k_bound": {
+        "en": "K bound (memory-capacity)",
+        "zh": "K 上限（显存容量）",
+    },
+    "perf.l_bound": {
+        "en": "L bound (compute / bandwidth @ SLA)",
+        "zh": "L 上限（算力/带宽 @ SLA）",
+    },
+    "perf.max_concurrent": {
+        "en": "Max concurrent",
+        "zh": "最大并发",
+    },
+    "perf.bottleneck": {
+        "en": "Bottleneck",
+        "zh": "瓶颈类型",
+    },
+    "perf.bottleneck.memory_capacity": {
+        "en": "Memory capacity",
+        "zh": "显存容量",
+    },
+    "perf.bottleneck.memory_bandwidth": {
+        "en": "Memory bandwidth / compute",
+        "zh": "显存带宽 / 算力",
+    },
+    "perf.bottleneck.compute": {
+        "en": "Compute",
+        "zh": "算力",
+    },
+    "perf.bottleneck.insufficient_data": {
+        "en": "Insufficient data",
+        "zh": "数据不足",
+    },
+    "perf.optimization.header": {
+        "en": "Optimization suggestions",
+        "zh": "优化建议",
+    },
+    "perf.opt.quantize_int4": {
+        "en": "Quantize to INT4: weight bytes halve → decode tok/s roughly 2× → concurrency scales accordingly.",
+        "zh": "量化到 INT4：权重字节减半 → decode tok/s 约翻倍 → 并发能力随之提升。",
+    },
+    "perf.opt.relax_sla": {
+        "en": "Relax SLA: if per-user target drops to 15 tok/s, L bound roughly doubles.",
+        "zh": "放宽 SLA：若每用户目标降至 15 tok/s，L 上限约翻倍。",
+    },
+    "perf.opt.kv_fp8": {
+        "en": "KV cache FP8 quantization: halves per-request KV, doubles the K bound at long context.",
+        "zh": "KV cache 量化到 FP8：单请求 KV 减半，长上下文下 K 上限约翻倍。",
+    },
+    "perf.opt.moe_offload": {
+        "en": "MoE expert offload to CPU: frees HBM for more KV cache at the cost of PCIe latency per new expert.",
+        "zh": "MoE 专家卸载到 CPU：释放 HBM 给 KV cache，代价是新专家激活时的 PCIe 延迟。",
+    },
+    # Explain section
+    "section.explain": {
+        "en": "Full derivation traces (--explain)",
+        "zh": "完整推导链（--explain）",
+    },
+    "explain.formula": {"en": "Formula", "zh": "公式"},
+    "explain.inputs": {"en": "Inputs", "zh": "输入"},
+    "explain.steps": {"en": "Computation", "zh": "计算步骤"},
+    "explain.result": {"en": "Result", "zh": "结果"},
+    "explain.source": {"en": "Source", "zh": "来源"},
+    "explain.see_also": {"en": "See also", "zh": "延伸阅读"},
+    "explain.intro": {
+        "en": (
+            "Each entry below shows the formula used, the inputs that went in, "
+            "every computation step, and the primary source. "
+            "Paste any single entry into an LLM and ask 'does this math check out?' "
+            "— the tool stays deterministic, the second opinion is yours."
+        ),
+        "zh": (
+            "下面每一项都给出所用公式、输入、每一步计算、主要来源。"
+            "把任一项复制粘贴给 LLM，问『这个推理对吗』即可。"
+            "工具保持确定性，second opinion 交给你。"
+        ),
+    },
+    # LLM review section
+    "section.llm_review": {
+        "en": "LLM second opinion (--llm-review, EXPERIMENTAL)",
+        "zh": "LLM 审阅（--llm-review，实验性）",
+    },
+    "llm_review.disclaimer": {
+        "en": (
+            "⚠  This is a second opinion from an external LLM ({model} via {base_url}). "
+            "It is tagged [llm-opinion] and NEVER overrides the 6 primary labels. "
+            "LLMs can be wrong; the tool's deterministic output takes precedence."
+        ),
+        "zh": (
+            "⚠  以下是来自外部 LLM（{model}，经 {base_url}）的第二意见。"
+            "标签为 [LLM 观点]，**永远不覆盖** 前 6 级主标签。"
+            "LLM 可能出错；工具的确定性输出优先。"
+        ),
+    },
+    "llm_review.unavailable": {
+        "en": "LLM review unavailable: {error}",
+        "zh": "LLM 审阅不可用：{error}",
+    },
+    "llm_review.setup_hint": {
+        "en": (
+            "To enable: export LLM_CAL_REVIEWER_API_KEY=<key>  "
+            "[optional: LLM_CAL_REVIEWER_BASE_URL, LLM_CAL_REVIEWER_MODEL]"
+        ),
+        "zh": (
+            "启用方法：export LLM_CAL_REVIEWER_API_KEY=<key>  "
+            "[可选：LLM_CAL_REVIEWER_BASE_URL、LLM_CAL_REVIEWER_MODEL]"
+        ),
+    },
+}
+def set_locale(loc: Locale) -> None:
+    """Set the module-wide locale used by later `t()` lookups."""
+    global _current_locale
+    _current_locale = loc
+def get_locale() -> Locale:
+    """Return the currently active module-wide locale."""
+    return _current_locale
+def detect_locale_from_env() -> Locale:
+    """Auto-detect from standard locale env vars."""
+    for var in ("LC_ALL", "LC_MESSAGES", "LANG"):
+        val = os.environ.get(var, "").lower()
+        if val.startswith("zh"):
+            return "zh"
+    return "en"
+def t(key: str, **kwargs: object) -> str:
+    """Translate a message key. Unknown keys return the key itself (fail loud)."""
+    bundle = _MESSAGES.get(key)
+    if bundle is None:
+        return key
+    template = bundle.get(_current_locale, bundle.get("en", key))
+    if kwargs:
+        try:
+            return template.format(**kwargs)
+        except (KeyError, IndexError):
+            return template
+    return template

src/llm_cal/common/yaml_loader.py ADDED Viewed

	@@ -0,0 +1,48 @@

+"""Pydantic-validated YAML loader.
+Shared between engine_compat and hardware modules. Supports `lazy=True` param
+(v0.1 does not implement lazy — signature reserved for v0.2 when matrix > 100).
+"""
+from __future__ import annotations
+from pathlib import Path
+from typing import TypeVar
+import yaml
+from pydantic import BaseModel, ValidationError
+T = TypeVar("T", bound=BaseModel)
+class YamlLoadError(Exception):
+    """YAML file could not be parsed or validated.
+    Also raised by `load_yaml` when the file is missing or empty.
+    """
+def load_yaml(path: str | Path, schema: type[T], *, lazy: bool = False) -> T:
+    """Load + validate a YAML file against a Pydantic schema.
+    Args:
+        path: YAML file to load.
+        schema: Pydantic model the YAML is expected to conform to.
+        lazy: Reserved for v0.2 (on-demand loading of large matrices). v0.1
+              ignores this; document-scale data is small enough that eager
+              loading is fine.
+    """
+    _ = lazy  # v0.1 behavior is always eager
+    p = Path(path)
+    if not p.exists():
+        raise YamlLoadError(f"YAML file not found: {p}")
+    try:
+        with p.open("r", encoding="utf-8") as f:
+            raw = yaml.safe_load(f)
+    except yaml.YAMLError as e:
+        raise YamlLoadError(f"YAML parse error in {p}: {e}") from e
+    if raw is None:
+        raise YamlLoadError(f"YAML file {p} is empty")
+    try:
+        return schema.model_validate(raw)
+    except ValidationError as e:
+        raise YamlLoadError(f"Schema validation failed for {p}:\n{e}") from e

src/llm_cal/core/__init__.py ADDED Viewed

File without changes

src/llm_cal/core/cache.py ADDED Viewed

	@@ -0,0 +1,97 @@

+"""Disk cache for model-source responses.
+Key design decisions (from /plan-eng-review Issue #2 + Issue #10 critical):
+- Key = (source, model_id, commit_sha). Commit sha is included so a repo update
+  invalidates cache automatically — prevents the critical regression of serving
+  stale data after the upstream model updates.
+- TTL = 7 days default. Even without a commit change, we force re-fetch weekly.
+- `--refresh` flag sets `bypass=True` on `get()` — caller drives it.
+- Store location: platformdirs user cache dir, subdirectory `llm-cal`.
+"""
+from __future__ import annotations
+from dataclasses import asdict, dataclass, is_dataclass
+from pathlib import Path
+from typing import Any
+import diskcache
+from platformdirs import user_cache_dir
+from llm_cal.model_source.base import ModelArtifact, SiblingFile
+_DEFAULT_TTL_SECONDS = 7 * 24 * 60 * 60  # 7 days
+@dataclass(frozen=True)
+class CacheKey:
+    source: str
+    model_id: str
+    commit_sha: str | None
+    def to_string(self) -> str:
+        return f"{self.source}::{self.model_id}::{self.commit_sha or 'HEAD'}"
+class ArtifactCache:
+    """Persistent cache for ModelArtifact instances."""
+    def __init__(
+        self, cache_dir: str | Path | None = None, ttl_seconds: int = _DEFAULT_TTL_SECONDS
+    ) -> None:
+        if cache_dir is None:
+            cache_dir = user_cache_dir("llm-cal", appauthor=False)
+        Path(cache_dir).mkdir(parents=True, exist_ok=True)
+        self._cache = diskcache.Cache(str(cache_dir))
+        self._ttl = ttl_seconds
+    def get(self, key: CacheKey, bypass: bool = False) -> ModelArtifact | None:
+        """Look up an artifact. `bypass=True` always returns None (used by --refresh).
+        If `key.commit_sha` is None (no revision pinning), we never serve from cache
+        because we can't prove freshness.
+        """
+        if bypass or key.commit_sha is None:
+            return None
+        raw = self._cache.get(key.to_string())
+        if raw is None:
+            return None
+        return _deserialize_artifact(raw)
+    def set(self, key: CacheKey, artifact: ModelArtifact) -> None:
+        """Cache an artifact. No-op if commit_sha is None (can't guarantee freshness)."""
+        if key.commit_sha is None:
+            return
+        self._cache.set(key.to_string(), _serialize_artifact(artifact), expire=self._ttl)
+    def invalidate(self, key: CacheKey) -> bool:
+        """Explicit invalidation, returns True if something was removed."""
+        return bool(self._cache.delete(key.to_string()))
+    def clear(self) -> None:
+        """Wipe the whole cache — for tests and `llm-cal cache clear` (future)."""
+        self._cache.clear()
+    def close(self) -> None:
+        self._cache.close()
+def _serialize_artifact(a: ModelArtifact) -> dict[str, Any]:
+    return {
+        "source": a.source,
+        "model_id": a.model_id,
+        "commit_sha": a.commit_sha,
+        "config": a.config,
+        "siblings": [asdict(s) if is_dataclass(s) else s for s in a.siblings],
+    }
+def _deserialize_artifact(raw: dict[str, Any]) -> ModelArtifact:
+    return ModelArtifact(
+        source=raw["source"],
+        model_id=raw["model_id"],
+        commit_sha=raw["commit_sha"],
+        config=raw["config"],
+        siblings=tuple(SiblingFile(**s) for s in raw["siblings"]),
+    )

src/llm_cal/core/evaluator.py ADDED Viewed

	@@ -0,0 +1,375 @@

+"""Evaluator — the single orchestration layer.
+v0.1 partial implementation: composes model_source + detector + weight_analyzer
++ reconciler + kv_cache + engine_compat + hardware. Fleet planner and command
+generator land in Week 5 remainder.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from llm_cal.architecture.detector import detect
+from llm_cal.architecture.formulas.kv_cache import compute_kv_cache_bytes
+from llm_cal.architecture.formulas.weight import estimate_total_params
+from llm_cal.architecture.profile import ArchitectureProfile
+from llm_cal.command_generator.sglang import generate_sglang_command
+from llm_cal.command_generator.vllm import generate_vllm_command
+from llm_cal.core.cache import ArtifactCache, CacheKey
+from llm_cal.engine_compat.loader import EngineCompatEntry, find_match
+from llm_cal.fleet.planner import FleetRecommendation, plan
+from llm_cal.hardware.loader import GPUSpec, UnknownGPUError, lookup
+from llm_cal.model_source.base import ModelArtifact, ModelSource
+from llm_cal.model_source.huggingface import HuggingFaceSource
+from llm_cal.output.labels import AnnotatedValue
+from llm_cal.performance.compute import (
+    DEFAULT_DECODE_BW_UTILIZATION,
+    DEFAULT_PREFILL_UTILIZATION,
+    DecodeEstimate,
+    PrefillEstimate,
+    estimate_decode,
+    estimate_prefill,
+)
+from llm_cal.performance.concurrency import ConcurrencyAnalysis
+from llm_cal.performance.concurrency import analyze as analyze_concurrency
+from llm_cal.weight_analyzer import WeightReport, analyze
+from llm_cal.weight_analyzer.fingerprint import (
+    QuantFingerprint,
+    from_config,
+    from_safetensors_dtypes,
+)
+from llm_cal.weight_analyzer.reconciler import ReconciliationReport, reconcile
+from llm_cal.weight_analyzer.safetensors_reader import (
+    fetch_tensor_dtypes,
+    pick_sample_shard,
+)
+# Sequence length (128K tokens) at which per-request KV bytes are computed
+# for the fleet planner, independent of any user context_length override.
+_KV_REFERENCE_CTX = 131_072  # matches fleet.planner's _REFERENCE_CTX_TOKENS
+@dataclass(frozen=True)
+class EvaluationReport:
+    """Everything the evaluator produces for one model.
+    Built once at the end of `Evaluator.evaluate`; immutable afterwards.
+    """
+    # Identity of the evaluated model: the id the caller passed in, plus the
+    # source name and commit sha reported by the fetched artifact.
+    model_id: str
+    source: str
+    commit_sha: str | None
+    # GPU name exactly as requested by the caller, and its resolved spec
+    # (None when the hardware lookup failed).
+    gpu: str
+    gpu_spec: GPUSpec | None
+    gpu_error: str | None  # message if gpu wasn't found
+    # Inference engine name as requested by the caller.
+    engine: str
+    # Architecture detected from the model config.
+    profile: ArchitectureProfile
+    # Weight analysis of the artifact's file listing.
+    weight: WeightReport
+    # Parameter count estimated from the architecture profile.
+    total_params_estimate: AnnotatedValue[int]
+    # Quantization reconciliation of observed bytes vs. parameter count.
+    reconciliation: ReconciliationReport
+    # KV cache bytes keyed by context length (tokens).
+    kv_cache_by_context: dict[int, AnnotatedValue[int]] = field(default_factory=dict)
+    # Engine compatibility entry matched by model_type, if any.
+    engine_match: EngineCompatEntry | None = None
+    # Fleet plan and launch command; both stay None when the GPU is unknown
+    # or the observed weight size is zero.
+    fleet: FleetRecommendation | None = None
+    generated_command: str | None = None
+    # Performance analysis — filled when user passes SLA args (or defaults).
+    prefill: PrefillEstimate | None = None
+    decode: DecodeEstimate | None = None
+    concurrency: ConcurrencyAnalysis | None = None
+    # Effective performance inputs (user value or default); None without a fleet.
+    perf_input_tokens: int | None = None
+    perf_output_tokens: int | None = None
+    perf_target_tokens_per_sec: float | None = None
+class Evaluator:
+    """Orchestrates: model_source -> detect -> analyze -> reconcile -> KV cache
+    -> engine compat -> hardware lookup.
+    Fleet planning and command generation are remaining Week 5 additions.
+    """
+    def __init__(
+        self,
+        source: ModelSource | None = None,
+        cache: ArtifactCache | None = None,
+    ) -> None:
+        self._source = source or HuggingFaceSource()
+        self._cache = cache or ArtifactCache()
+    def evaluate(
+        self,
+        model_id: str,
+        gpu: str,
+        engine: str,
+        gpu_count: int | None = None,
+        context_length: int | None = None,
+        refresh: bool = False,
+        input_tokens: int | None = None,
+        output_tokens: int | None = None,
+        target_tokens_per_sec: float | None = None,
+        prefill_utilization: float = DEFAULT_PREFILL_UTILIZATION,
+        decode_bw_utilization: float = DEFAULT_DECODE_BW_UTILIZATION,
+        concurrency_degradation: float = 1.0,
+    ) -> EvaluationReport:
+        artifact = self._fetch(model_id, refresh=refresh)
+        profile = detect(artifact.config)
+        total_params_est = estimate_total_params(profile)
+        total_params = total_params_est.value
+        observed_bytes_for_fp = sum(
+            (s.size or 0) for s in artifact.siblings if s.filename.endswith(".safetensors")
+        )
+        fingerprint = self._resolve_quant_fingerprint(
+            artifact,
+            observed_bytes=observed_bytes_for_fp,
+            total_params=total_params if total_params > 0 else 0,
+        )
+        weight = analyze(
+            artifact.siblings,
+            total_params=total_params if total_params > 0 else None,
+            fingerprint=fingerprint,
+        )
+        reconciliation = reconcile(
+            weight.total_bytes.value,
+            total_params or 1,
+            fingerprint=fingerprint,
+        )
+        contexts_to_report = self._select_context_lengths(profile, context_length)
+        kv_by_ctx = {
+            ctx: compute_kv_cache_bytes(profile, seq_len=ctx, dtype_bytes=2)
+            for ctx in contexts_to_report
+        }
+        # Engine compatibility — match by model_type alone (v0.1). Version
+        # filtering can be added via a future --engine-version flag.
+        engine_match = find_match(engine=engine, model_type=profile.model_type)
+        # Hardware lookup — never raises out to CLI, we embed the error message
+        # so the user sees a partial report instead of aborting.
+        gpu_spec: GPUSpec | None = None
+        gpu_error: str | None = None
+        try:
+            gpu_spec = lookup(gpu)
+        except UnknownGPUError as e:
+            gpu_error = str(e)
+        # Fleet planning — only if we have a known GPU. The planner's reference
+        # context is 128K; derive KV bytes there (computing fresh in case the
+        # user chose a non-overlapping context_length override).
+        fleet: FleetRecommendation | None = None
+        generated_command: str | None = None
+        if gpu_spec is not None and weight.total_bytes.value > 0:
+            kv_ref = compute_kv_cache_bytes(profile, _KV_REFERENCE_CTX, dtype_bytes=2)
+            kv_by_context_bytes = {ctx: av.value for ctx, av in kv_by_ctx.items() if av.value > 0}
+            fleet = plan(
+                profile=profile,
+                weight_bytes=weight.total_bytes.value,
+                kv_bytes_per_request_at_ref=max(1, kv_ref.value),
+                gpu=gpu_spec,
+                forced_gpu_count=gpu_count,
+                kv_bytes_by_context=kv_by_context_bytes,
+            )
+            # Pick the gpu_count to emit the command for: user's forced value,
+            # else the best_tier's recommendation.
+            chosen_count = gpu_count or next(
+                (o.gpu_count for o in fleet.options if o.tier == fleet.best_tier),
+                fleet.options[0].gpu_count,
+            )
+            generated_command = _generate_command(
+                engine=engine,
+                model_id=model_id,
+                profile=profile,
+                tp=chosen_count,
+                entry=engine_match,
+                max_model_len=context_length,
+            )
+        # Performance analysis — runs whenever we have hardware + fleet.
+        prefill_est: PrefillEstimate | None = None
+        decode_est: DecodeEstimate | None = None
+        concurrency_est: ConcurrencyAnalysis | None = None
+        if gpu_spec is not None and fleet is not None and total_params > 0:
+            # Pick the fleet tier we're analyzing (user's forced count or best tier).
+            chosen = gpu_count or next(
+                (o.gpu_count for o in fleet.options if o.tier == fleet.best_tier),
+                fleet.options[0].gpu_count,
+            )
+            # Resolve performance defaults when user didn't specify.
+            eff_input = input_tokens or 2000
+            eff_target = target_tokens_per_sec or 30.0
+            prefill_est = estimate_prefill(
+                profile=profile,
+                total_params=total_params,
+                gpu=gpu_spec,
+                num_gpus=chosen,
+                input_tokens=eff_input,
+                utilization=prefill_utilization,
+            )
+            # MoE active ratio: active/total = (shared + experts_per_tok) / (shared + routed)
+            moe_active_ratio: float | None = None
+            if profile.moe is not None:
+                active_experts = profile.moe.num_experts_per_tok + profile.moe.num_shared_experts
+                total_experts = profile.moe.num_routed_experts + profile.moe.num_shared_experts
+                if total_experts > 0:
+                    moe_active_ratio = active_experts / total_experts
+            decode_est = estimate_decode(
+                profile=profile,
+                total_weight_bytes=weight.total_bytes.value,
+                gpu=gpu_spec,
+                num_gpus=chosen,
+                bw_utilization=decode_bw_utilization,
+                moe_active_params_ratio=moe_active_ratio,
+            )
+            # Compute cluster headroom at the chosen tier + KV per request at the
+            # *longest* surveyed context (most conservative).
+            chosen_option = next(
+                (o for o in fleet.options if o.gpu_count == chosen),
+                fleet.options[-1],
+            )
+            headroom_per_gpu = (
+                chosen_option.usable_bytes_per_gpu - chosen_option.weight_bytes_per_gpu
+            )
+            # Cluster-wide headroom is per-GPU * N; currently we use per-GPU view below.
+            # Reference context for the L bound: match K's headroom context (128K
+            # if model supports it, else max).
+            kv_ref_ctx = 131_072 if 131_072 in kv_by_ctx else max(kv_by_ctx.keys())
+            kv_ref_bytes: int = kv_by_ctx[kv_ref_ctx].value
+            # Apply TP-aware sharding (same rule fleet planner uses).
+            from llm_cal.fleet.planner import _kv_shards
+            shards = _kv_shards(profile, chosen)
+            kv_ref_per_gpu = max(1, kv_ref_bytes // shards)
+            # Request KV lives per-GPU; under replication, it's the same value on all.
+            # We compare cluster headroom against per-GPU KV (each request consumes
+            # per-GPU KV on every rank simultaneously).
+            # To convert to "how many requests fit", we divide *per-GPU* headroom
+            # by *per-GPU* KV.
+            headroom_per_req_view = max(0, headroom_per_gpu)
+            concurrency_est = analyze_concurrency(
+                cluster_headroom_bytes=headroom_per_req_view,
+                kv_bytes_per_request=kv_ref_per_gpu,
+                decode=decode_est,
+                target_tokens_per_sec=eff_target,
+                degradation=concurrency_degradation,
+            )
+        return EvaluationReport(
+            model_id=model_id,
+            source=artifact.source,
+            commit_sha=artifact.commit_sha,
+            gpu=gpu,
+            gpu_spec=gpu_spec,
+            gpu_error=gpu_error,
+            engine=engine,
+            profile=profile,
+            weight=weight,
+            total_params_estimate=total_params_est,
+            reconciliation=reconciliation,
+            kv_cache_by_context=kv_by_ctx,
+            engine_match=engine_match,
+            fleet=fleet,
+            generated_command=generated_command,
+            prefill=prefill_est,
+            decode=decode_est,
+            concurrency=concurrency_est,
+            perf_input_tokens=input_tokens or 2000 if fleet else None,
+            perf_output_tokens=output_tokens or 512 if fleet else None,
+            perf_target_tokens_per_sec=target_tokens_per_sec or 30.0 if fleet else None,
+        )
+    def _fetch(self, model_id: str, refresh: bool) -> ModelArtifact:
+        artifact = self._source.fetch(model_id)
+        key = CacheKey(
+            source=self._source.name,
+            model_id=model_id,
+            commit_sha=artifact.commit_sha,
+        )
+        cached = self._cache.get(key, bypass=refresh)
+        if cached is not None:
+            return cached
+        self._cache.set(key, artifact)
+        return artifact
+    def _resolve_quant_fingerprint(
+        self,
+        artifact: ModelArtifact,
+        observed_bytes: int,
+        total_params: int,
+    ) -> QuantFingerprint | None:
+        """Resolve the quantization scheme via authoritative evidence.
+        Priority:
+          1. config.json `quantization_config` — explicit author declaration.
+             Free, no extra network call. But if its predicted bytes are
+             wildly off (>15% from observed), fall through — config.json
+             can be incomplete or stale (DeepSeek-V4-Flash declares
+             `quant_method=fp8` but ships an FP4+FP8 mixed pack; trusting
+             the declaration produces a 45% wrong answer).
+          2. safetensors file header — per-tensor dtype fingerprint. One
+             Range GET on the first shard. Ground truth.
+        Returns None on any failure. The reconciler falls back to bytes-only
+        argmin in that case (v0.1.1 behavior).
+        """
+        fp = from_config(artifact.config)
+        if fp is not None and self._fingerprint_matches_bytes(fp, observed_bytes, total_params):
+            return fp
+        shard = pick_sample_shard(artifact.siblings)
+        if shard is None:
+            return fp  # safetensors unavailable — best we can do is the config hint
+        dtypes = fetch_tensor_dtypes(
+            source=artifact.source,
+            model_id=artifact.model_id,
+            revision=artifact.commit_sha or "main",
+            shard_filename=shard.filename,
+        )
+        if not dtypes:
+            return fp
+        st_fp = from_safetensors_dtypes(dtypes)
+        # Header is ground truth — prefer it over config when both exist.
+        return st_fp if st_fp is not None else fp
+    @staticmethod
+    def _fingerprint_matches_bytes(
+        fp: QuantFingerprint, observed_bytes: int, total_params: int
+    ) -> bool:
+        """Sanity-check a fingerprint's predicted bytes against observed.
+        Returns True if the declared scheme's predicted bytes are within 15%
+        of observed. False means config.json is either lying or describes
+        only part of the model — we should consult safetensors instead.
+        """
+        from llm_cal.weight_analyzer import _QUANT_BPP
+        bpp = _QUANT_BPP.get(fp.scheme, 0.0)
+        if bpp <= 0 or total_params <= 0 or observed_bytes <= 0:
+            return True  # can't verify — don't penalize the fingerprint
+        predicted = bpp * total_params
+        rel_err = abs(observed_bytes - predicted) / predicted
+        return rel_err <= 0.15
+    @staticmethod
+    def _select_context_lengths(profile: ArchitectureProfile, override: int | None) -> list[int]:
+        if override is not None:
+            return [override]
+        candidates = [4_096, 32_768, 131_072]
+        max_pos = profile.position.max_position_embeddings if profile.position else None
+        if max_pos and max_pos > 131_072:
+            candidates.append(max_pos)
+        if max_pos:
+            candidates = [c for c in candidates if c <= max_pos]
+        return candidates
+def _generate_command(
+    engine: str,
+    model_id: str,
+    profile: ArchitectureProfile,
+    tp: int,
+    entry: EngineCompatEntry | None,
+    max_model_len: int | None,
+) -> str:
+    engine_norm = engine.lower().strip()
+    if engine_norm == "sglang":
+        return generate_sglang_command(model_id, profile, tp, entry, max_model_len=max_model_len)
+    return generate_vllm_command(model_id, profile, tp, entry, max_model_len=max_model_len)

src/llm_cal/core/explain.py ADDED Viewed

	@@ -0,0 +1,504 @@

+"""Full derivation traces for each non-trivial number in the report.
+This module is only invoked when the user passes `--explain`. It doesn't
+recompute anything — it reads the values that the main evaluator already
+produced and wraps them in a formatted explanation with formula, inputs,
+step-by-step computation, and primary source citation.
+Design rationale: the tool's core promise is deterministic, auditable
+output. `--explain` makes that auditability human-readable. A user can:
+  1. Read the explanation themselves
+  2. Paste it into an LLM and ask "does this math check out?"
+  3. Cross-reference docs/methodology.md for the primary source
+All three preserve determinism — the LLM is the user's tool, not ours.
+"""
+from __future__ import annotations
+import math
+from dataclasses import dataclass, field
+from llm_cal.core.evaluator import EvaluationReport
+@dataclass(frozen=True)
+class ExplainInput:
+    """One input variable to a formula.
+    All fields are display strings; no numeric values are stored here.
+    """
+    name: str  # variable name as shown next to the formula
+    value: str  # pre-formatted for display
+    label: str  # e.g. "[verified]", "[estimated]"
+    note: str = ""  # optional disambiguation
+@dataclass(frozen=True)
+class ExplainEntry:
+    """A full derivation trace for one output number.
+    Entries are produced by the section builders in this module and collected
+    in report order by `build()`.
+    """
+    heading: str  # localized section title, e.g. "KV cache @ 128K"
+    formula: str  # the formula, literally
+    inputs: list[ExplainInput] = field(default_factory=list)
+    steps: list[str] = field(default_factory=list)  # step-by-step computation
+    result: str = ""  # final formatted answer with label
+    source: str = ""  # primary source citation
+    methodology_anchor: str = ""  # anchor in docs/methodology.md, e.g. "#prefill-latency"
+def build(report: EvaluationReport) -> list[ExplainEntry]:
+    """Produce explanation entries in the order they appear in the main report."""
+    entries: list[ExplainEntry] = []
+    _weight_bytes(report, entries)
+    _quantization(report, entries)
+    _kv_cache_contexts(report, entries)
+    _fleet_tiers(report, entries)
+    _prefill(report, entries)
+    _decode(report, entries)
+    _concurrency(report, entries)
+    return entries
+# ======================================================================
+# Weight
+# ======================================================================
+def _weight_bytes(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
+    w = report.weight.total_bytes
+    entries.append(
+        ExplainEntry(
+            heading="Weight bytes (safetensors file sum)",
+            formula="sum(sibling.size for sibling in HF model_info(files_metadata=True).siblings if sibling.endswith('.safetensors'))",
+            inputs=[
+                ExplainInput(
+                    name="HF model_info API",
+                    value=f"source={report.source}, sha={report.commit_sha or 'HEAD'}",
+                    label="[verified]",
+                ),
+            ],
+            steps=[
+                f"Raw value from API = {w.value:,} bytes",
+                f"= {w.value / 1e9:.2f} GB",
+            ],
+            result=f"{w.value:,} bytes [verified]",
+            source=w.source or "HF siblings API",
+            methodology_anchor="#weight-bytes",
+        )
+    )
+def _quantization(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
+    r = report.reconciliation
+    if not r.candidates:
+        return
+    best = r.candidates[0]
+    cands_table = "\n".join(
+        f"      {c.scheme:<16} predicted={c.predicted_bytes / 1e9:.2f} GB  "
+        f"error={c.relative_error * 100:.1f}%"
+        for c in r.candidates[:6]
+    )
+    entries.append(
+        ExplainEntry(
+            heading="Quantization scheme (reconciliation)",
+            formula="best_match = argmin_scheme |observed_bytes - scheme.bpp × total_params|",
+            inputs=[
+                ExplainInput(
+                    name="observed_bytes",
+                    value=f"{r.observed_bytes:,}",
+                    label="[verified]",
+                ),
+                ExplainInput(
+                    name="total_params",
+                    value=f"{r.total_params:,}",
+                    label="[estimated]",
+                    note="from architecture formula — see '#params-estimate' entry below",
+                ),
+            ],
+            steps=[
+                "For each known quantization scheme, predict total bytes = bpp × params:",
+                cands_table,
+                f"Winner: {best.scheme} at {best.relative_error * 100:.1f}% error",
+            ],
+            result=f"{r.best.value} [{r.best.label.value}]",
+            source="Nearest-anchor match against known bytes-per-param values",
+            methodology_anchor="#quantization-scheme",
+        )
+    )
+# ======================================================================
+# KV cache
+# ======================================================================
+def _kv_cache_contexts(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
+    """Append one KV-cache trace per reported context length.
+    Skipped entirely when the profile has no attention info; contexts whose
+    reported value is 0 are skipped individually. The per-token size and the
+    baseline are rebuilt here for display only; the final result shown is
+    always the value already stored in the report.
+    NOTE(review): the rebuilt baseline is not checked against the reported
+    value — confirm it matches what compute_kv_cache_bytes does per variant.
+    """
+    profile = report.profile
+    attn = profile.attention
+    if attn is None:
+        return
+    is_mla = attn.variant == "MLA"
+    is_csa_hca = attn.variant == "CSA_HCA"
+    for ctx, av in report.kv_cache_by_context.items():
+        if av.value == 0:
+            continue
+        # Rebuild the computation for transparency
+        if is_mla and attn.kv_lora_rank:
+            # dtype size is hard-coded to 2 bytes here, shown as BF16/FP16 below.
+            per_tok_per_layer = attn.kv_lora_rank * 2  # kv_lora_rank × dtype(2)
+            formula = "per_tok_per_layer = kv_lora_rank × dtype_bytes   (MLA: compressed latent KV)"
+            inputs = [
+                ExplainInput("kv_lora_rank", str(attn.kv_lora_rank), "[verified]"),
+                ExplainInput("dtype_bytes", "2", "[verified]", note="BF16/FP16"),
+                ExplainInput("seq_len", f"{ctx:,}", "[verified]"),
+                ExplainInput("num_layers", str(profile.num_hidden_layers), "[verified]"),
+            ]
+        else:
+            # Leading 2 = separate K and V tensors; trailing 2 = dtype bytes.
+            per_tok_per_layer = 2 * attn.num_kv_heads * attn.head_dim * 2
+            formula = "per_tok_per_layer = 2 × num_kv_heads × head_dim × dtype_bytes   (standard attention)"
+            inputs = [
+                ExplainInput("num_kv_heads", str(attn.num_kv_heads), "[verified]"),
+                ExplainInput("head_dim", str(attn.head_dim), "[verified]"),
+                ExplainInput("dtype_bytes", "2", "[verified]", note="BF16/FP16"),
+                ExplainInput("seq_len", f"{ctx:,}", "[verified]"),
+                ExplainInput("num_layers", str(profile.num_hidden_layers), "[verified]"),
+            ]
+        baseline = per_tok_per_layer * ctx * profile.num_hidden_layers
+        steps = [
+            f"per_tok_per_layer = {per_tok_per_layer:,} bytes",
+            f"baseline = per_tok_per_layer × seq_len × num_layers = {baseline:,} bytes",
+        ]
+        if is_csa_hca and attn.compress_ratios:
+            ratios = attn.compress_ratios
+            # A ratio of 0 means "keep everything" (fraction 1.0); otherwise 1/r.
+            avg = sum(1.0 if r == 0 else 1.0 / r for r in ratios) / len(ratios)
+            inputs.append(
+                ExplainInput(
+                    "compress_ratios",
+                    f"len={len(ratios)} (avg keep-fraction={avg:.4f})",
+                    "[verified]",
+                )
+            )
+            formula += (
+                "\napply_csa_hca: baseline × avg(1/r_i for r_i in compress_ratios, 0 = keep-all=1)"
+            )
+            steps.extend(
+                [
+                    f"avg_keep_fraction = {avg:.4f}",
+                    f"result = baseline × avg_keep_fraction = {av.value:,} bytes",
+                ]
+            )
+        else:
+            steps.append(f"result = baseline = {av.value:,} bytes")
+        entries.append(
+            ExplainEntry(
+                heading=f"KV cache @ {_fmt_ctx(ctx)} context",
+                formula=formula,
+                inputs=inputs,
+                steps=steps,
+                result=f"{av.value:,} bytes = {av.value / 1e9:.2f} GB [{av.label.value}]",
+                source=(
+                    "DeepSeek-V2 paper (MLA); DeepSeek-V4 tech report (CSA+HCA); "
+                    "standard attention formula per Attention Is All You Need (Vaswani 2017)"
+                ),
+                methodology_anchor="#kv-cache-per-request",
+            )
+        )
+# ======================================================================
+# Fleet tiers
+# ======================================================================
+def _fleet_tiers(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
+    if report.fleet is None or report.gpu_spec is None:
+        return
+    # One explain block per tier (min / dev / prod)
+    for opt in report.fleet.options:
+        tier_label = opt.tier
+        headroom = opt.usable_bytes_per_gpu - opt.weight_bytes_per_gpu
+        steps = [
+            f"per-GPU HBM usable (@ 90% util) = {opt.usable_bytes_per_gpu:,} bytes",
+            f"weight per GPU = total_weight / TP_size = "
+            f"{report.weight.total_bytes.value:,} / {opt.gpu_count} = "
+            f"{opt.weight_bytes_per_gpu:,} bytes",
+            f"headroom per GPU = usable - weight = {headroom:,} bytes ({headroom / 1e9:.2f} GB)",
+        ]
+        fit_criterion = {"min": 1, "dev": 8, "prod": 16}.get(tier_label, 1)
+        steps.append(
+            f"tier criterion: headroom ≥ weight_per_gpu + {fit_criterion} × kv_per_request_128K"
+        )
+        steps.append(
+            f"smallest TP count in {list(report.fleet.valid_tp_sizes)} that "
+            f"satisfies the criterion: {opt.gpu_count}"
+        )
+        if not opt.fits:
+            steps.append(
+                f"NOTE: does not fit the criterion — the chosen {opt.gpu_count} "
+                "is the best available."
+            )
+        entries.append(
+            ExplainEntry(
+                heading=f"Fleet tier: {tier_label} ({opt.gpu_count} GPUs)",
+                formula=(
+                    "smallest TP in valid_set where "
+                    "weight_per_gpu + concurrent × kv_per_request ≤ usable_per_gpu"
+                ),
+                inputs=[
+                    ExplainInput(
+                        "total_weight_bytes",
+                        f"{report.weight.total_bytes.value:,}",
+                        "[verified]",
+                    ),
+                    ExplainInput(
+                        "valid_TP_sizes",
+                        str(list(report.fleet.valid_tp_sizes)),
+                        "[estimated]",
+                        note="divisors of num_attention_heads capped at 8 (single node)",
+                    ),
+                    ExplainInput(
+                        "GPU memory_gb",
+                        f"{report.gpu_spec.memory_gb} GB",
+                        "[verified]",
+                    ),
+                ],
+                steps=steps,
+                result=f"{opt.gpu_count} GPUs, fit={opt.fits}",
+                source="vLLM --gpu-memory-utilization 0.9 convention; TP divisibility required by vLLM/SGLang",
+                methodology_anchor="#tp-aware-kv-sharding",
+            )
+        )
+# ======================================================================
+# Prefill
+# ======================================================================
+def _prefill(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
+    if (
+        report.prefill is None
+        or report.gpu_spec is None
+        or report.fleet is None
+        or report.perf_input_tokens is None
+    ):
+        return
+    p = report.prefill
+    # Figure out chosen GPU count from the fleet
+    chosen = next(
+        (o.gpu_count for o in report.fleet.options if o.tier == report.fleet.best_tier),
+        report.fleet.options[0].gpu_count,
+    )
+    entries.append(
+        ExplainEntry(
+            heading="Prefill latency (single request)",
+            formula=(
+                "FLOPs = 2 × params × input_tokens\n"
+                "effective_TFLOPS = peak_fp16_TFLOPS × num_gpus × utilization\n"
+                "latency_ms = (FLOPs / (effective_TFLOPS × 1e12)) × 1000"
+            ),
+            inputs=[
+                ExplainInput(
+                    "params",
+                    f"{report.total_params_estimate.value:,}",
+                    "[estimated]",
+                    note="from architecture formula (see weight.py)",
+                ),
+                ExplainInput("input_tokens", f"{report.perf_input_tokens:,}", "[user-set]"),
+                ExplainInput(
+                    "peak_fp16_TFLOPS",
+                    f"{report.gpu_spec.fp16_tflops}",
+                    "[verified]",
+                    note=f"from GPU database, {report.gpu_spec.id} spec",
+                ),
+                ExplainInput("num_gpus", f"{chosen}", "[estimated]"),
+                ExplainInput(
+                    "utilization",
+                    f"{p.utilization:.2f}",
+                    "[user-set]",
+                    note="empirical MFU, default 0.40 — override with --prefill-util",
+                ),
+            ],
+            steps=[
+                f"FLOPs = 2 × {report.total_params_estimate.value:,} × "
+                f"{report.perf_input_tokens:,} = {p.total_flops.value:.3e}",
+                f"effective_TFLOPS = {report.gpu_spec.fp16_tflops} × {chosen} × "
+                f"{p.utilization:.2f} = {p.peak_effective_tflops.value:.1f}",
+                f"latency = {p.total_flops.value:.3e} / "
+                f"({p.peak_effective_tflops.value:.1f} × 1e12) × 1000 = "
+                f"{p.latency_ms.value:.1f} ms",
+            ],
+            result=f"{p.latency_ms.value:.1f} ms [{p.latency_ms.label.value}]",
+            source="Kaplan et al. 2020 'Scaling Laws for Neural Language Models' (arxiv.org/abs/2001.08361)",
+            methodology_anchor="#prefill-latency",
+        )
+    )
+# ======================================================================
+# Decode
+# ======================================================================
+def _decode(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
+    if report.decode is None or report.gpu_spec is None or report.fleet is None:
+        return
+    d = report.decode
+    bw = report.gpu_spec.memory_bandwidth_gbps or 0
+    chosen = next(
+        (o.gpu_count for o in report.fleet.options if o.tier == report.fleet.best_tier),
+        report.fleet.options[0].gpu_count,
+    )
+    weight_per_gpu = d.active_weight_bytes_per_gpu.value
+    effective_bw_gbs = bw * d.bw_utilization
+    steps = [
+        f"weight_per_gpu = {report.weight.total_bytes.value:,} / {chosen} = "
+        f"{weight_per_gpu:,} bytes ({weight_per_gpu / 1e9:.2f} GB)",
+        f"effective_bw = {bw} × {d.bw_utilization:.2f} = {effective_bw_gbs:.0f} GB/s",
+        f"per_gpu_tok_per_sec = effective_bw / weight_per_gpu = "
+        f"{effective_bw_gbs * 1e9 / weight_per_gpu:.1f} tok/s",
+        f"cluster_tok_per_sec = per_gpu × {chosen} × "
+        f"{d.cluster_comm_efficiency:.2f} = {d.cluster_tokens_per_sec.value:.1f} tok/s",
+    ]
+    entries.append(
+        ExplainEntry(
+            heading="Decode throughput (cluster)",
+            formula=(
+                "per_gpu_tok_per_sec = memory_bandwidth × bw_util / weight_bytes_per_gpu\n"
+                "cluster_tok_per_sec = per_gpu × num_gpus × cluster_comm_efficiency"
+            ),
+            inputs=[
+                ExplainInput(
+                    "GPU memory_bandwidth_gbps",
+                    f"{bw}",
+                    "[verified]",
+                    note=f"from GPU database, {report.gpu_spec.id}",
+                ),
+                ExplainInput(
+                    "bw_util",
+                    f"{d.bw_utilization:.2f}",
+                    "[user-set]",
+                    note="empirical, default 0.50 — override with --decode-bw-util",
+                ),
+                ExplainInput("weight_bytes_per_gpu", f"{weight_per_gpu:,}", "[estimated]"),
+                ExplainInput("num_gpus", f"{chosen}", "[estimated]"),
+                ExplainInput(
+                    "cluster_comm_efficiency",
+                    f"{d.cluster_comm_efficiency:.2f}",
+                    "[user-set]",
+                    note="NCCL AllReduce efficiency on NVLink, default 0.90",
+                ),
+            ],
+            steps=steps,
+            result=f"{d.cluster_tokens_per_sec.value:.1f} tok/s [estimated]",
+            source="vLLM paper (Kwon et al. SOSP 2023, arxiv.org/abs/2309.06180)",
+            methodology_anchor="#decode-tokens-per-second",
+        )
+    )
+# ======================================================================
+# Concurrency bounds
+# ======================================================================
+def _concurrency(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
+    if report.concurrency is None:
+        return
+    c = report.concurrency
+    entries.append(
+        ExplainEntry(
+            heading="K bound (memory capacity)",
+            formula="K = floor(per_GPU_headroom_bytes / per_GPU_kv_bytes_per_request)",
+            inputs=[
+                ExplainInput(
+                    "per_GPU_headroom_bytes",
+                    f"{c.k_source_headroom_bytes:,}",
+                    "[estimated]",
+                ),
+                ExplainInput(
+                    "per_GPU_kv_bytes_per_request",
+                    f"{c.k_source_kv_per_req_bytes:,}",
+                    "[estimated]",
+                    note="post-TP-sharding via min(tp, num_kv_heads)",
+                ),
+            ],
+            steps=[
+                f"K = floor({c.k_source_headroom_bytes:,} / "
+                f"{c.k_source_kv_per_req_bytes:,}) = {c.k_bound.value}",
+            ],
+            result=f"K = {c.k_bound.value} [{c.k_bound.label.value}]",
+            source="TP sharding rule from vLLM source code (verified)",
+            methodology_anchor="#k-bound-memory-capacity",
+        )
+    )
+    l_tps = report.decode.cluster_tokens_per_sec.value if report.decode else 0
+    entries.append(
+        ExplainEntry(
+            heading="L bound (compute/bandwidth at SLA)",
+            formula=(
+                "L = floor(cluster_tok_per_sec / target_per_user_tok_per_sec / degradation_factor)"
+            ),
+            inputs=[
+                ExplainInput("cluster_tok_per_sec", f"{l_tps:.1f}", "[estimated]"),
+                ExplainInput(
+                    "target_per_user_tok_per_sec",
+                    f"{c.target_tokens_per_sec:.1f}",
+                    "[user-set]",
+                    note="SLA, override with --target-tokens-per-sec",
+                ),
+                ExplainInput(
+                    "degradation_factor",
+                    f"{c.degradation_factor:.2f}",
+                    "[user-set]",
+                    note="default 1.0 = no degradation; override with --concurrency-degradation",
+                ),
+            ],
+            steps=[
+                f"L = floor({l_tps:.1f} / {c.target_tokens_per_sec:.1f} / "
+                f"{c.degradation_factor:.2f}) = {c.l_bound.value}",
+            ],
+            result=f"L = {c.l_bound.value} [{c.l_bound.label.value}]",
+            source="Standard SLA-based capacity planning",
+            methodology_anchor="#l-bound-compute-bandwidth-at-sla",
+        )
+    )
+    entries.append(
+        ExplainEntry(
+            heading="Max concurrent + bottleneck verdict",
+            formula="max_concurrent = min(K, L); bottleneck = 'memory_capacity' if K ≤ L else 'memory_bandwidth / compute'",
+            inputs=[
+                ExplainInput("K", str(c.k_bound.value), f"[{c.k_bound.label.value}]"),
+                ExplainInput("L", str(c.l_bound.value), f"[{c.l_bound.label.value}]"),
+            ],
+            steps=[
+                f"max_concurrent = min(K={c.k_bound.value}, L={c.l_bound.value}) = "
+                f"{c.max_concurrent.value}",
+                f"bottleneck = {c.bottleneck}",
+            ],
+            result=(f"{c.max_concurrent.value} concurrent, bottleneck = {c.bottleneck}"),
+            source=c.bottleneck_reason_en,
+            methodology_anchor="#concurrency-bounds-k-l",
+        )
+    )
+    # Sanity check to silence "unused math import" if no steps triggered math.
+    _ = math.floor(0)
+# ======================================================================
+# Helpers
+# ======================================================================
+def _fmt_ctx(ctx: int) -> str:
+    if ctx >= 1_000_000:
+        return f"{ctx // 1_000_000}M"
+    if ctx >= 1024:
+        return f"{ctx // 1024}K"
+    return str(ctx)

src/llm_cal/engine_compat/__init__.py ADDED Viewed

File without changes

src/llm_cal/engine_compat/loader.py ADDED Viewed

	@@ -0,0 +1,118 @@

+"""Engine compatibility matrix loader + match function."""
+from __future__ import annotations
+from functools import lru_cache
+from importlib.resources import files
+from pathlib import Path
+from typing import Literal
+from packaging.specifiers import InvalidSpecifier, SpecifierSet
+from packaging.version import InvalidVersion, Version
+from pydantic import BaseModel, Field
+from llm_cal.common.yaml_loader import load_yaml
+# Allowed values for an entry's `support` field.
+SupportLevel = Literal["full", "partial", "broken", "unverified"]
+# Allowed values for an entry's `verification_level` field, i.e. how strong the
+# evidence behind the entry is.
+VerificationLevel = Literal["verified", "cited", "unverified"]
+class EngineFlag(BaseModel):
+    """One engine command-line flag listed on a compatibility entry."""
+    # The flag itself, e.g. "--attention-backend".
+    flag: str
+    # Value to pass with the flag; None when no value is given.
+    value: str | None = None
+    # Optional explanatory notes in English and Chinese.
+    note_en: str | None = None
+    note_zh: str | None = None
+class EngineSource(BaseModel):
+    """One piece of evidence cited by a compatibility entry."""
+    type: str  # release_notes | announcement | pr | tested
+    # Where the evidence lives and when it was captured.
+    url: str | None = None
+    captured_date: str | None = None
+    # Optional explanatory notes in English and Chinese.
+    note_en: str | None = None
+    note_zh: str | None = None
+    # `tested` specific fields (may be absent on other types)
+    tester: str | None = None
+    date: str | None = None
+    hardware: str | None = None
+class EngineCompatEntry(BaseModel):
+    """Support record for one (engine, version range, model type) combination."""
+    engine: Literal["vllm", "sglang"]
+    version_spec: str  # e.g. ">=0.19.0"
+    # Model type this entry applies to; `find_match` compares it by equality.
+    matches_model_type: str
+    support: SupportLevel
+    verification_level: VerificationLevel
+    # Each list below defaults to a fresh empty list per instance.
+    required_flags: list[EngineFlag] = Field(default_factory=list)
+    optional_flags: list[EngineFlag] = Field(default_factory=list)
+    sources: list[EngineSource] = Field(default_factory=list)
+    caveats_en: list[str] = Field(default_factory=list)
+    caveats_zh: list[str] = Field(default_factory=list)
+class EngineCompatMatrix(BaseModel):
+    """Root document of the compatibility matrix: a schema version plus all entries."""
+    schema_version: int
+    entries: list[EngineCompatEntry]
+def _default_path() -> Path:
+    return Path(str(files("llm_cal.engine_compat").joinpath("matrix.yaml")))
+@lru_cache(maxsize=1)
+def load_matrix(path: Path | None = None) -> EngineCompatMatrix:
+    return load_yaml(path or _default_path(), EngineCompatMatrix)
+def find_match(
+    engine: str,
+    model_type: str,
+    version: str | None = None,
+    matrix: EngineCompatMatrix | None = None,
+) -> EngineCompatEntry | None:
+    """Find the highest-version matching entry for (engine, model_type).
+    If `version` is None, we return the broadest entry (any version matching
+    model_type on the given engine). If `version` is given, we filter to entries
+    whose version_spec covers it.
+    """
+    m = matrix or load_matrix()
+    engine_norm = engine.lower().strip()
+    model_type_norm = model_type.lower().strip()
+    candidates = [
+        e for e in m.entries if e.engine == engine_norm and e.matches_model_type == model_type_norm
+    ]
+    if not candidates:
+        return None
+    if version is None:
+        # Return the entry with the "highest lower bound" as the most relevant
+        return max(candidates, key=_lower_bound_key)
+    try:
+        v = Version(version)
+    except InvalidVersion:
+        return candidates[0]
+    for entry in candidates:
+        try:
+            if v in SpecifierSet(entry.version_spec):
+                return entry
+        except InvalidSpecifier:
+            continue
+    return None
+def _lower_bound_key(entry: EngineCompatEntry) -> Version:
+    """Extract the lowest version a spec matches (approximate, used only for sort)."""
+    try:
+        spec = SpecifierSet(entry.version_spec)
+    except InvalidSpecifier:
+        return Version("0.0.0")
+    for single in spec:
+        if single.operator in (">=", "==", ">"):
+            try:
+                return Version(single.version)
+            except InvalidVersion:
+                continue
+    return Version("0.0.0")

src/llm_cal/engine_compat/matrix.yaml ADDED Viewed

	@@ -0,0 +1,512 @@

+# Engine compatibility matrix — v0.1 initial entries.
+#
+# VERIFICATION LEVELS:
+#   verified   = actually tested by someone with hardware (requires type=tested source)
+#                  >>> v0.1 has ZERO `verified` entries — author has no test hardware <<<
+#   cited      = evidence exists (release note / PR / announcement) but not tested by us
+#   unverified = no sources, just an educated guess
+#
+# The tool ALWAYS surfaces verification_level in output. Users never see a green
+# checkmark on an unverified row.
+schema_version: 2
+entries:
+  # ============================================================
+  # vLLM
+  # ============================================================
+  - engine: vllm
+    version_spec: ">=0.19.0"
+    matches_model_type: deepseek_v4
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags:
+      - flag: "--attention-backend"
+        value: "auto"
+        note_en: "Picks CSA+HCA backend automatically."
+        note_zh: "自动选择 CSA+HCA 注意力后端。"
+    sources:
+      - type: release_notes
+        url: "https://github.com/vllm-project/vllm/releases/tag/v0.19.0"
+        captured_date: "2026-04-23"
+      - type: announcement
+        url: "https://x.com/vllm_project/status/2047520252851105796"
+        captured_date: "2026-04-23"
+        note_en: "Day-0 support announcement."
+        note_zh: "Day-0 支持公告。"
+    caveats_en:
+      - "H800 MoE all-to-all is bottlenecked by halved NVLink; throughput lower than H100."
+      - "1M context requires --max-model-len 1048576 + --gpu-memory-utilization 0.9."
+    caveats_zh:
+      - "H800 的 MoE all-to-all 受限于减半的 NVLink，吞吐明显低于 H100。"
+      - "1M 上下文需要 --max-model-len 1048576 + --gpu-memory-utilization 0.9。"
+  - engine: vllm
+    version_spec: ">=0.18.0,<0.19.0"
+    matches_model_type: deepseek_v3_2
+    support: full
+    verification_level: cited
+    required_flags:
+      - flag: "--attention-backend"
+        value: "nsa"
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.vllm.ai/projects/recipes/en/latest/DeepSeek/DeepSeek-V3_2.html"
+        captured_date: "2026-04-24"
+    caveats_en:
+      - "TP=8 padding overhead: 128 attention heads / 8 = 16 per rank but padded to 64. Prefer TP=2 + DP/EP."
+    caveats_zh:
+      - "TP=8 存在 padding 开销：128 个头 / 8 = 16 头/rank，但填充到 64。建议 TP=2 + DP/EP。"
+  - engine: vllm
+    version_spec: ">=0.7.0"
+    matches_model_type: deepseek_v3
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags:
+      - flag: "--trust-remote-code"
+        value: null
+        note_en: "Required for custom DeepSeek modeling code."
+        note_zh: "用于加载 DeepSeek 的自定义建模代码。"
+    sources:
+      - type: release_notes
+        url: "https://docs.vllm.ai/projects/recipes/en/latest/DeepSeek/DeepSeek-V3.html"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: vllm
+    version_spec: ">=0.6.0"
+    matches_model_type: llama
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.vllm.ai/"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: vllm
+    version_spec: ">=0.7.0"
+    matches_model_type: qwen3
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.vllm.ai/"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: vllm
+    version_spec: ">=0.7.0"
+    matches_model_type: qwen3_moe
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags:
+      - flag: "--enable-expert-parallel"
+        value: null
+        note_en: "Enables DP+EP for MoE all-to-all distribution."
+        note_zh: "启用 DP+EP，对 MoE all-to-all 通信更友好。"
+    sources:
+      - type: release_notes
+        url: "https://docs.vllm.ai/en/latest/serving/expert_parallel_deployment/"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: vllm
+    version_spec: ">=0.6.0"
+    matches_model_type: mixtral
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.vllm.ai/"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: vllm
+    version_spec: ">=0.6.0"
+    matches_model_type: mistral
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.vllm.ai/"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: vllm
+    version_spec: ">=0.6.0"
+    matches_model_type: qwen2
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: vllm
+    version_spec: ">=0.6.0"
+    matches_model_type: qwen2_moe
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags:
+      - flag: "--enable-expert-parallel"
+        value: null
+        note_en: "Enables DP+EP for MoE all-to-all distribution."
+        note_zh: "启用 DP+EP，对 MoE all-to-all 通信更友好。"
+    sources:
+      - type: release_notes
+        url: "https://docs.vllm.ai/en/latest/serving/expert_parallel_deployment/"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: vllm
+    version_spec: ">=0.5.0"
+    matches_model_type: gemma
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
+        captured_date: "2026-04-24"
+    caveats_en:
+      - "Gemma uses tied embeddings — output head shares embedding weights."
+    caveats_zh:
+      - "Gemma 使用权重绑定的 embedding（tied embeddings），输出头与 embedding 共享权重。"
+  - engine: vllm
+    version_spec: ">=0.6.0"
+    matches_model_type: gemma2
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: vllm
+    version_spec: ">=0.7.0"
+    matches_model_type: gemma3
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
+        captured_date: "2026-04-24"
+    caveats_en:
+      - "Gemma 3 adds vision modality — v0.1 of llm-cal treats it as text-only for now."
+    caveats_zh:
+      - "Gemma 3 新增视觉多模态能力，llm-cal v0.1 当作纯文本模型处理。"
+  - engine: vllm
+    version_spec: ">=0.5.0"
+    matches_model_type: phi
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: vllm
+    version_spec: ">=0.5.0"
+    matches_model_type: phi3
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: vllm
+    version_spec: ">=0.6.0"
+    matches_model_type: deepseek_v2
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags:
+      - flag: "--trust-remote-code"
+        value: null
+        note_en: "Required for DeepSeek V2 custom modeling code."
+        note_zh: "加载 DeepSeek V2 的自定义建模代码。"
+    sources:
+      - type: release_notes
+        url: "https://docs.vllm.ai/en/latest/models/supported_models.html"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  # ============================================================
+  # SGLang
+  # ============================================================
+  - engine: sglang
+    version_spec: ">=0.5.0"
+    matches_model_type: deepseek_v3_2
+    support: full
+    verification_level: cited
+    required_flags:
+      - flag: "--attention-backend"
+        value: "nsa"
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.sglang.io/advanced_features/attention_backend.html"
+        captured_date: "2026-04-24"
+      - type: announcement
+        url: "https://www.lmsys.org/blog/2025-09-29-deepseek-V32/"
+        captured_date: "2025-09-29"
+        note_en: "Day-0 V3.2 support announcement."
+        note_zh: "V3.2 的 Day-0 支持公告。"
+    caveats_en: []
+    caveats_zh: []
+  - engine: sglang
+    version_spec: ">=0.5.0"
+    matches_model_type: deepseek_v4
+    support: unverified
+    verification_level: unverified
+    required_flags: []
+    optional_flags: []
+    sources: []
+    caveats_en:
+      - "As of 2026-04-24, no Day-0 announcement for V4. DSA/NSA infrastructure exists (V3.2), expected to extend."
+    caveats_zh:
+      - "截至 2026-04-24，尚无 V4 的 Day-0 公告。已有 V3.2 的 DSA/NSA 基础设施，预期会扩展支持。"
+  - engine: sglang
+    version_spec: ">=0.4.0"
+    matches_model_type: deepseek_v3
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://github.com/sgl-project/sglang"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: sglang
+    version_spec: ">=0.4.0"
+    matches_model_type: llama
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://github.com/sgl-project/sglang"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: sglang
+    version_spec: ">=0.4.0"
+    matches_model_type: qwen3
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: sglang
+    version_spec: ">=0.4.0"
+    matches_model_type: mixtral
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://github.com/sgl-project/sglang"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: sglang
+    version_spec: ">=0.4.0"
+    matches_model_type: qwen2
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: sglang
+    version_spec: ">=0.4.0"
+    matches_model_type: qwen2_moe
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: sglang
+    version_spec: ">=0.4.0"
+    matches_model_type: qwen3_moe
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: sglang
+    version_spec: ">=0.4.0"
+    matches_model_type: mistral
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://github.com/sgl-project/sglang"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: sglang
+    version_spec: ">=0.4.0"
+    matches_model_type: gemma
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: sglang
+    version_spec: ">=0.4.0"
+    matches_model_type: gemma2
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: sglang
+    version_spec: ">=0.5.0"
+    matches_model_type: gemma3
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: sglang
+    version_spec: ">=0.4.0"
+    matches_model_type: phi
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: sglang
+    version_spec: ">=0.4.0"
+    matches_model_type: phi3
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags: []
+    sources:
+      - type: release_notes
+        url: "https://docs.sglang.io/basic_usage/popular_model_usage.html"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []
+  - engine: sglang
+    version_spec: ">=0.4.0"
+    matches_model_type: deepseek_v2
+    support: full
+    verification_level: cited
+    required_flags: []
+    optional_flags:
+      - flag: "--trust-remote-code"
+        value: null
+        note_en: "Required for DeepSeek V2 custom modeling code."
+        note_zh: "加载 DeepSeek V2 的自定义建模代码。"
+    sources:
+      - type: release_notes
+        url: "https://github.com/sgl-project/sglang"
+        captured_date: "2026-04-24"
+    caveats_en: []
+    caveats_zh: []

src/llm_cal/fleet/__init__.py ADDED Viewed

File without changes

src/llm_cal/fleet/planner.py ADDED Viewed

	@@ -0,0 +1,282 @@

+"""Fleet planner — reverse-inference of "how many GPUs do I need".
+Three tiers:
+  * min  — just enough to hold weights + KV for one request
+           at 128K context (after the reserved overhead)
+  * dev  — room for ~8 concurrent at 128K context
+  * prod — room for ~16 concurrent at 128K context
+TP-divisibility constraint (CRITICAL regression test): the number of attention
+heads must be divisible by the number of GPUs. vLLM/SGLang with TP=3 on a
+64-head model would fail to start; we only recommend counts in the valid set.
+Reserved overhead per GPU = 10% of HBM (CUDA context + activations + framework),
+which matches `--gpu-memory-utilization 0.9` in vLLM.
+Per-GPU KV modeling is TP-aware:
+  per_gpu_KV = total_KV / min(tp_size, max(1, num_kv_heads))
+  * MQA (kv_heads=1): KV replicates fully across ranks → divisor is 1,
+    per-GPU KV = total (accurate for DeepSeek V4-Flash, Qwen MQA variants).
+  * GQA (kv_heads=8): KV splits across ranks up to num_kv_heads → at TP=8,
+    per-GPU KV = total/8 (accurate for Llama 3 70B, Qwen 72B).
+  * MHA: splits fully up to num_heads.
+This matches vLLM/SGLang's actual sharding behavior. MLA-latent KV is
+technically replicated in most frameworks, but since num_kv_heads is
+typically 1 in MLA (DeepSeek V2/V3/V4), the formula degenerates to
+replication anyway.
+"""
+from __future__ import annotations
+import math
+from dataclasses import dataclass
+from typing import Literal
+from llm_cal.architecture.profile import ArchitectureProfile
+from llm_cal.hardware.loader import GPUSpec
+# Names of the three recommendation tiers.
+Tier = Literal["min", "dev", "prod"]
+# Fraction of each GPU's memory held back as overhead; the rest is "usable".
+_OVERHEAD_FRACTION = 0.10
+# Number of concurrent requests (KV at the reference context) each tier must fit.
+_KV_HEAD_ROOM_CONCURRENT: dict[Tier, int] = {
+    "min": 1,  # one request worth of KV at 128K
+    "dev": 8,
+    "prod": 16,
+}
+# For recommendation logic, compute per-GPU fit at this reference context length.
+_REFERENCE_CTX_TOKENS = 131_072
+# Max recommended TP within a single 8-GPU node. Beyond this we'd want PP/EP,
+# which is out of v0.1 scope.
+_MAX_TP_SINGLE_NODE = 8
+@dataclass(frozen=True)
+class FleetOption:
+    """Immutable result of evaluating one GPU count for one tier."""
+    tier: Tier
+    gpu_count: int
+    weight_bytes_per_gpu: int
+    kv_bytes_per_request: int  # at reference context (128K)
+    max_concurrent_at_reference_ctx: int
+    # concurrency ceiling at each context length the user asked about.
+    # Key = context token count, value = max concurrent requests that fit.
+    max_concurrent_by_context: tuple[tuple[int, int], ...]
+    usable_bytes_per_gpu: int
+    fits: bool  # False => the best we can do still overflows headroom at reference ctx
+    # Explanation of the option, presumably English / Chinese variants of the
+    # same text — confirm against `_evaluate_count`.
+    reason_en: str
+    reason_zh: str
+@dataclass(frozen=True)
+class FleetRecommendation:
+    """Immutable output of `plan`: the evaluated options plus the TP constraint."""
+    # One option per tier (min, dev, prod), or a single option when the GPU
+    # count was forced by the caller.
+    options: tuple[FleetOption, ...]
+    # Tier that consumers should treat as the default choice.
+    best_tier: Tier
+    # GPU counts considered valid tensor-parallel sizes for this model.
+    valid_tp_sizes: tuple[int, ...]
+    # Note about the TP constraint, in English and Chinese.
+    constraint_note_en: str
+    constraint_note_zh: str
+def plan(
+    profile: ArchitectureProfile,
+    weight_bytes: int,
+    kv_bytes_per_request_at_ref: int,
+    gpu: GPUSpec,
+    forced_gpu_count: int | None = None,
+    kv_bytes_by_context: dict[int, int] | None = None,
+) -> FleetRecommendation:
+    """Recommend GPU counts for the three tiers, or a single option when forced.
+    `kv_bytes_by_context` is optional metadata used only for the per-option
+    concurrency breakdown (e.g. "~23 concurrent @ 128K, ~2 @ 1M"). Tier-fit
+    decisions still use `kv_bytes_per_request_at_ref` (the reference context).
+    """
+    kv_by_ctx = kv_bytes_by_context or {}
+    bytes_per_gpu_total = gpu.memory_gb * 1_000_000_000
+    usable_per_gpu = int(bytes_per_gpu_total * (1 - _OVERHEAD_FRACTION))
+    valid_tp = _valid_tp_sizes(profile)
+    constraint_en = _constraint_note_en(profile, valid_tp)
+    constraint_zh = _constraint_note_zh(profile, valid_tp)
+    if forced_gpu_count is not None:
+        option = _evaluate_count(
+            forced_gpu_count,
+            profile=profile,
+            weight_bytes=weight_bytes,
+            kv_bytes=kv_bytes_per_request_at_ref,
+            usable_per_gpu=usable_per_gpu,
+            valid_tp=valid_tp,
+            tier="dev",  # generic label when user forced
+            kv_by_context=kv_by_ctx,
+        )
+        return FleetRecommendation(
+            options=(option,),
+            best_tier="dev",
+            valid_tp_sizes=tuple(valid_tp),
+            constraint_note_en=constraint_en,
+            constraint_note_zh=constraint_zh,
+        )
+    options: list[FleetOption] = []
+    for tier in ("min", "dev", "prod"):
+        gpu_count = _smallest_fitting_count(
+            valid_tp,
+            profile=profile,
+            weight_bytes=weight_bytes,
+            kv_bytes=kv_bytes_per_request_at_ref,
+            usable_per_gpu=usable_per_gpu,
+            concurrent=_KV_HEAD_ROOM_CONCURRENT[tier],
+        )
+        # Fall back to the largest TP if nothing fits — flagged as `fits=False`.
+        chosen = gpu_count if gpu_count is not None else max(valid_tp)
+        option = _evaluate_count(
+            chosen,
+            profile=profile,
+            weight_bytes=weight_bytes,
+            kv_bytes=kv_bytes_per_request_at_ref,
+            usable_per_gpu=usable_per_gpu,
+            valid_tp=valid_tp,
+            tier=tier,
+            kv_by_context=kv_by_ctx,
+        )
+        options.append(option)
+    # Best tier: dev if it fits, otherwise min, otherwise whatever exists
+    best = "dev" if options[1].fits else ("min" if options[0].fits else "prod")
+    return FleetRecommendation(
+        options=tuple(options),
+        best_tier=best,  # type: ignore[arg-type]
+        valid_tp_sizes=tuple(valid_tp),
+        constraint_note_en=constraint_en,
+        constraint_note_zh=constraint_zh,
+    )
def _valid_tp_sizes(profile: ArchitectureProfile) -> list[int]:
    """List the tensor-parallel sizes that are candidates for ``profile``.

    A size qualifies when it divides ``num_heads`` evenly and is no larger
    than ``_MAX_TP_SINGLE_NODE``. The result is in ascending order. A profile
    without an attention block, or with a non-positive head count, yields
    ``[1]``, as does an empty candidate list.
    """
    attention = profile.attention
    if attention is None:
        return [1]
    head_count = attention.num_heads
    if head_count <= 0:
        return [1]
    # Never probe past the head count itself or the single-node cap.
    largest = min(head_count, _MAX_TP_SINGLE_NODE)
    sizes = [tp for tp in range(1, largest + 1) if head_count % tp == 0]
    return sizes if sizes else [1]
def _kv_shards(profile: ArchitectureProfile, tp_size: int) -> int:
    """Return the number of TP ranks the KV cache is actually divided across.

    The result is ``tp_size`` clamped to the model's KV head count, with the
    head count itself floored at 1. Ranks beyond the KV head count only hold
    replicas, so they do not shrink the per-GPU KV share any further. A
    profile without an attention block counts as a single shard.
    """
    attention = profile.attention
    if attention is None:
        return 1
    # Floor at 1 so a zero/negative head count can never produce a zero divisor.
    head_limit = attention.num_kv_heads if attention.num_kv_heads > 1 else 1
    return tp_size if tp_size < head_limit else head_limit
def _smallest_fitting_count(
    valid_tp: list[int],
    *,
    profile: ArchitectureProfile,
    weight_bytes: int,
    kv_bytes: int,
    usable_per_gpu: int,
    concurrent: int,
) -> int | None:
    """Return the first GPU count in ``valid_tp`` that passes ``_fits``.

    Candidates are tried in the order given, so the result is the smallest
    fitting count when ``valid_tp`` is ascending (as ``_valid_tp_sizes``
    produces it). Returns ``None`` when no candidate fits or the list is
    empty.
    """
    fitting = (
        candidate
        for candidate in valid_tp
        if _fits(candidate, profile, weight_bytes, kv_bytes, usable_per_gpu, concurrent)
    )
    # The generator is lazy: evaluation stops at the first fitting candidate.
    return next(fitting, None)
def _fits(
    gpu_count: int,
    profile: ArchitectureProfile,
    weight_bytes: int,
    kv_bytes: int,
    usable_per_gpu: int,
    concurrent: int,
) -> bool:
    """Check whether one GPU's share of weights plus KV stays within budget.

    Weights are divided over ``gpu_count``; a single request's KV cache is
    divided over ``_kv_shards(profile, gpu_count)``. Both shares are rounded
    up. Returns ``True`` when the weight share plus ``concurrent`` KV shares
    does not exceed ``usable_per_gpu``.
    """
    per_gpu_weights = math.ceil(weight_bytes / gpu_count)
    per_gpu_kv = math.ceil(kv_bytes / _kv_shards(profile, gpu_count))
    return per_gpu_weights + per_gpu_kv * concurrent <= usable_per_gpu
def _evaluate_count(
    gpu_count: int,
    *,
    profile: ArchitectureProfile,
    weight_bytes: int,
    kv_bytes: int,
    usable_per_gpu: int,
    valid_tp: list[int],
    tier: Tier,
    kv_by_context: dict[int, int],
) -> FleetOption:
    """Describe what a ``gpu_count``-GPU deployment can hold for ``tier``.

    Per-GPU weights are ``weight_bytes`` split over ``gpu_count``; per-GPU KV
    is one request's KV split over ``_kv_shards(profile, gpu_count)``. Both
    are rounded up. The memory left after the weights is divided by the
    per-GPU KV to get a request capacity (never negative, and 0 when the KV
    size is not positive) — once for ``kv_bytes`` and once for every entry of
    ``kv_by_context``.

    ``fits`` comes from ``_fits`` using the tier's entry in
    ``_KV_HEAD_ROOM_CONCURRENT``. The English/Chinese reason strings report,
    in priority order: a GPU count outside ``valid_tp``, a failed fit, or the
    achieved concurrency. A count outside ``valid_tp`` only changes the reason
    text; ``fits`` is still whatever ``_fits`` returned.
    """
    per_gpu_weights = math.ceil(weight_bytes / gpu_count)
    kv_split = _kv_shards(profile, gpu_count)
    per_gpu_kv = math.ceil(kv_bytes / kv_split)
    spare_bytes = usable_per_gpu - per_gpu_weights

    if per_gpu_kv > 0:
        ref_capacity = max(0, spare_bytes // per_gpu_kv)
    else:
        ref_capacity = 0

    # One (context, capacity) pair per context length, shortest first; each
    # capacity uses that context's KV size after TP sharding.
    capacity_rows = []
    for ctx in sorted(kv_by_context):
        ctx_kv = kv_by_context[ctx]
        if ctx_kv > 0:
            ctx_capacity = max(0, spare_bytes // math.ceil(ctx_kv / kv_split))
        else:
            ctx_capacity = 0
        capacity_rows.append((ctx, ctx_capacity))

    target = _KV_HEAD_ROOM_CONCURRENT[tier]
    fits = _fits(gpu_count, profile, weight_bytes, kv_bytes, usable_per_gpu, target)

    # Reason strings
    if gpu_count not in valid_tp:
        tp_listing = sorted(valid_tp)
        reason_en = (
            f"GPU count {gpu_count} does not divide num_heads — valid TP sizes: {tp_listing}"
        )
        reason_zh = f"GPU 张数 {gpu_count} 无法整除注意力头数——有效 TP 张数：{tp_listing}"
    elif fits:
        ctx_k = _REFERENCE_CTX_TOKENS // 1024
        reason_en = f"fits ~{ref_capacity} concurrent @ {ctx_k}K ctx"
        reason_zh = f"可容纳约 {ref_capacity} 并发请求 @ {ctx_k}K 上下文"
    else:
        usable_gb = usable_per_gpu / 1e9
        reason_en = (
            f"Weights + {target}x KV would exceed "
            f"{usable_gb:.1f} GB usable per GPU"
        )
        reason_zh = (
            f"权重 + {target} 份 KV 超过单卡可用的 "
            f"{usable_gb:.1f} GB"
        )

    return FleetOption(
        tier=tier,
        gpu_count=gpu_count,
        weight_bytes_per_gpu=per_gpu_weights,
        kv_bytes_per_request=kv_bytes,
        max_concurrent_at_reference_ctx=ref_capacity,
        max_concurrent_by_context=tuple(capacity_rows),
        usable_bytes_per_gpu=usable_per_gpu,
        fits=fits,
        reason_en=reason_en,
        reason_zh=reason_zh,
    )
def _constraint_note_en(profile: ArchitectureProfile, valid_tp: list[int]) -> str:
    """Return the English note explaining how the TP candidates were chosen.

    The per-node GPU limit in the text is read from ``_MAX_TP_SINGLE_NODE``,
    the same cap ``_valid_tp_sizes`` applies, instead of a hard-coded 8, so
    the note cannot drift from the candidate list it describes. A profile
    without an attention block is reported as ``num_heads=0``.
    """
    heads = profile.attention.num_heads if profile.attention else 0
    return (
        f"TP must divide num_heads={heads}. "
        f"Candidates within one node (<={_MAX_TP_SINGLE_NODE} GPUs): {valid_tp}."
    )
def _constraint_note_zh(profile: ArchitectureProfile, valid_tp: list[int]) -> str:
    """Return the Chinese note explaining how the TP candidates were chosen.

    The per-node GPU limit in the text is read from ``_MAX_TP_SINGLE_NODE``,
    the same cap ``_valid_tp_sizes`` applies, instead of a hard-coded 8, so
    the note cannot drift from the candidate list it describes. A profile
    without an attention block is reported as ``num_heads=0``.
    """
    heads = profile.attention.num_heads if profile.attention else 0
    return (
        f"TP 张数必须整除 num_heads={heads}。"
        f"单节点（≤{_MAX_TP_SINGLE_NODE} 卡）候选：{valid_tp}。"
    )

src/llm_cal/hardware/__init__.py ADDED Viewed

File without changes

src/llm_cal/hardware/gpu_database.yaml ADDED Viewed

	@@ -0,0 +1,613 @@

+# GPU database — v0.1.
+#
+# DATA PROVENANCE:
+# Hardware specs (memory_gb, nvlink_bandwidth_gbps, memory_bandwidth_gbps,
+# fp16_tflops, fp8/fp4_support) come from public vendor datasheets and
+# commonly-cited benchmarks. Each entry records its source in `spec_source`
+# so users can audit.
+#
+# Conventions:
+# - memory_gb: per-card HBM / GDDR in GB (vendor nominal)
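+# - nvlink_bandwidth_gbps: aggregate NVLink (or equivalent like xGMI/HCCS)
+#   bandwidth in GB/s (gigabytes per second, despite the `_gbps` suffix).
+#   0 if the GPU has no high-bandwidth interconnect (e.g. consumer Ada
+#   removed NVLink).
+# - memory_bandwidth_gbps: device memory (HBM / GDDR / LPDDR) bandwidth,
+#   likewise in GB/s.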
+# - fp16_tflops: peak dense FP16/BF16 with Tensor Cores; vendor cited figure.
+# - fp8_support / fp4_support: whether the GPU has NATIVE Tensor Core
+#   acceleration for that precision. Software emulation does NOT count.
+#
+# To add a new GPU: append an entry with all required fields + spec_source.
+# See docs/architecture-guide.md "How to add a new GPU".
+schema_version: 1
+gpus:
+  # ========================================================================
+  # NVIDIA Blackwell (2024+) — native FP4
+  # ========================================================================
+  - id: B200
+    aliases: [B200-SXM, B200-192G]
+    memory_gb: 192
+    nvlink_bandwidth_gbps: 1800
+    memory_bandwidth_gbps: 8000
+    fp16_tflops: 2250
+    fp8_support: true
+    fp4_support: true
+    spec_source: "NVIDIA Blackwell architecture overview (nvidia.com/blackwell)"
+    notes_en: "Blackwell flagship. Native FP4 Tensor Cores. First GPU that accelerates DeepSeek-V4-Flash-style FP4 at hardware level."
+    notes_zh: "Blackwell 旗舰。原生 FP4 Tensor Core，首款在硬件层加速 DeepSeek-V4-Flash 类 FP4 模型的 GPU。"
+  # ========================================================================
+  # NVIDIA Hopper (2022+)
+  # ========================================================================
+  - id: H100
+    aliases: [H100-SXM5, H100-80G, H100-SXM]
+    memory_gb: 80
+    nvlink_bandwidth_gbps: 900
+    memory_bandwidth_gbps: 3350
+    fp16_tflops: 989
+    fp8_support: true
+    fp4_support: false
+    spec_source: "NVIDIA H100 datasheet (nvidia.com/h100)"
+    notes_en: "Hopper flagship. Full NVLink."
+    notes_zh: "Hopper 架构旗舰，完整 NVLink 带宽。"
+  - id: H800
+    aliases: [H800-SXM5, H800-80G]
+    memory_gb: 80
+    nvlink_bandwidth_gbps: 400
+    memory_bandwidth_gbps: 3350
+    fp16_tflops: 989
+    fp8_support: true
+    fp4_support: false
+    spec_source: "NVIDIA H800 compliance variant — NVLink halved from H100 per US export controls"
+    notes_en: "China-regulated H100 variant. NVLink bandwidth halved (400 vs 900). Same HBM and compute as H100."
+    notes_zh: "H100 的中国合规版本。NVLink 带宽减半（400 vs 900 GB/s），HBM 容量和算力与 H100 相同。"
+  - id: H200
+    aliases: [H200-SXM, H200-141G]
+    memory_gb: 141
+    nvlink_bandwidth_gbps: 900
+    memory_bandwidth_gbps: 4800
+    fp16_tflops: 989
+    fp8_support: true
+    fp4_support: false
+    spec_source: "NVIDIA H200 datasheet (nvidia.com/h200)"
+    notes_en: "Hopper with HBM3e. 141 GB per GPU."
+    notes_zh: "搭载 HBM3e 的 Hopper，单卡 141 GB。"
+  - id: GH200
+    aliases: [Grace-Hopper, GH200-144G, GH200-96G]
+    memory_gb: 144
+    nvlink_bandwidth_gbps: 900
+    memory_bandwidth_gbps: 4800
+    fp16_tflops: 989
+    fp8_support: true
+    fp4_support: false
+    spec_source: "NVIDIA GH200 Grace Hopper datasheet 2023 (144GB HBM3e variant, dense FP16=989 TFLOPS; sparsity doubles it)"
+    notes_en: "Grace Hopper superchip — Hopper GPU + Grace CPU on one module. 144 GB HBM3e (96 GB HBM3 variant also exists). NVLink-C2C 900 GB/s CPU<->GPU unified. TDP programmable 450-1000W. Ideal for models that spill beyond single GPU memory because GPU can access CPU LPDDR coherently."
+    notes_zh: "Grace Hopper 超级芯片 — Hopper GPU + Grace CPU 融合模组。144 GB HBM3e（另有 96 GB HBM3 版本）。NVLink-C2C 让 CPU/GPU 共享统一内存空间，900 GB/s 双向。TDP 可编程 450-1000W。模型单卡显存装不下时，可一致地访问 CPU 的 LPDDR。"
+  - id: GB200
+    aliases: [Grace-Blackwell, GB200-per-GPU]
+    memory_gb: 192
+    nvlink_bandwidth_gbps: 1800
+    memory_bandwidth_gbps: 8000
+    fp16_tflops: 2250
+    fp8_support: true
+    fp4_support: true
+    spec_source: "NVIDIA GB200 Superchip datasheet 2024 — per-GPU view. Each GB200 = 2 B200 + Grace CPU. Per B200: 192 GB HBM3e, 8 TB/s, 2250 TFLOPS dense FP16 (4500 sparsity). Grace CPU adds up to 480 GB LPDDR5x accessible via NVLink-C2C."
+    notes_en: "Grace Blackwell superchip — 2 B200 GPUs + Grace CPU on one module. Per-GPU specs here match B200, but each GB200 module unlocks 384 GB HBM3e total (192+192) plus coherent access to 480 GB Grace CPU LPDDR5x. FP4 native. Only deployable in NVL4/NVL72 rack-scale systems with liquid cooling. Per-GPU TDP 1200W."
+    notes_zh: "Grace Blackwell 超级芯片 — 双 B200 GPU + Grace CPU 融合。此处展示单 GPU 视角规格，与 B200 基本一致。每块 GB200 模组合计 384 GB HBM3e（双卡），并通过 NVLink-C2C 一致访问 480 GB Grace CPU 的 LPDDR5x。原生 FP4。仅在 NVL4 / NVL72 液冷机架系统中部署。单 GPU TDP 1200W。"
+  - id: H20
+    aliases: [H20-96G, H20-SXM]
+    memory_gb: 96
+    nvlink_bandwidth_gbps: 900
+    memory_bandwidth_gbps: 4000
+    fp16_tflops: 148
+    fp8_support: true
+    fp4_support: false
+    spec_source: "NVIDIA H20 — released 2024 as China-compliant successor to H800. Compute heavily reduced (~15% of H100); memory bandwidth and HBM3e preserved."
+    notes_en: "China-compliance Hopper post-Oct-2023 export rules. Compute ~15% of H100 (148 vs 989 TFLOPS), but HBM3e memory bandwidth preserved. Good for memory-bound LLM inference, poor for training."
+    notes_zh: "2023 年 10 月出口管制后的中国合规 Hopper。算力仅为 H100 的约 15%（148 vs 989 TFLOPS），但 HBM3e 显存带宽保留。推理（显存带宽受限）尚可，训练基本不实用。"
+  # ========================================================================
+  # NVIDIA Ada Lovelace (datacenter) — FP8 yes, NVLink no
+  # ========================================================================
+  - id: L40S
+    aliases: [L40-S, L40S-48G]
+    memory_gb: 48
+    nvlink_bandwidth_gbps: 0
+    memory_bandwidth_gbps: 864
+    fp16_tflops: 362
+    fp8_support: true
+    fp4_support: false
+    spec_source: "NVIDIA L40S datasheet 2023"
+    notes_en: "Ada datacenter. 48 GB GDDR6. No NVLink — multi-GPU setups rely on PCIe. Cost-effective for small/medium model inference."
+    notes_zh: "Ada 架构数据中心卡，48 GB GDDR6。无 NVLink，多卡需走 PCIe。中小模型推理性价比高。"
+  - id: L40
+    aliases: [L40-48G]
+    memory_gb: 48
+    nvlink_bandwidth_gbps: 0
+    memory_bandwidth_gbps: 864
+    fp16_tflops: 181
+    fp8_support: true
+    fp4_support: false
+    spec_source: "NVIDIA L40 datasheet 2022"
+    notes_en: "Ada datacenter predecessor to L40S. Same 48 GB, half the compute. Widely deployed in enterprise clouds."
+    notes_zh: "L40S 的前代，Ada 架构数据中心卡。同为 48 GB，算力减半。企业私有云部署量较大。"
+  - id: L4
+    aliases: [L4-24G]
+    memory_gb: 24
+    nvlink_bandwidth_gbps: 0
+    memory_bandwidth_gbps: 300
+    fp16_tflops: 121
+    fp8_support: true
+    fp4_support: false
+    spec_source: "NVIDIA L4 datasheet 2023"
+    notes_en: "Low-profile Ada, 24 GB GDDR6. Common in low-concurrency inference / transcoding. No NVLink."
+    notes_zh: "低功耗 Ada，24 GB GDDR6。常用于低并发推理和转码场景。无 NVLink。"
+  - id: RTX6000-Ada
+    aliases: [RTX-6000-Ada, RTX6000Ada, L6000]
+    memory_gb: 48
+    nvlink_bandwidth_gbps: 0
+    memory_bandwidth_gbps: 960
+    fp16_tflops: 365
+    fp8_support: true
+    fp4_support: false
+    spec_source: "NVIDIA RTX 6000 Ada datasheet 2022"
+    notes_en: "Ada Pro workstation. 48 GB, similar to L40S but for workstations. FP8 yes, no NVLink."
+    notes_zh: "Ada Pro 工作站卡。48 GB，规格接近 L40S 但面向工作站。支持 FP8，无 NVLink。"
+  - id: RTX4090
+    aliases: ["4090", RTX-4090]
+    memory_gb: 24
+    nvlink_bandwidth_gbps: 0
+    memory_bandwidth_gbps: 1008
+    fp16_tflops: 165
+    fp8_support: true
+    fp4_support: false
+    spec_source: "NVIDIA RTX 4090 datasheet 2022"
+    notes_en: "Consumer Ada. No NVLink. Large models need multi-GPU via PCIe (slower)."
+    notes_zh: "消费级 Ada 架构，无 NVLink。大模型多卡只能走 PCIe（明显更慢）。"
+  # ========================================================================
+  # NVIDIA Ampere (2020+)
+  # ========================================================================
+  - id: A100-80G
+    aliases: [A100-80, A100-SXM-80G]
+    memory_gb: 80
+    nvlink_bandwidth_gbps: 600
+    memory_bandwidth_gbps: 2039
+    fp16_tflops: 312
+    fp8_support: false
+    fp4_support: false
+    spec_source: "NVIDIA A100 datasheet 2020"
+    notes_en: "Ampere. No native FP8. Still widely deployed."
+    notes_zh: "Ampere 架构。不原生支持 FP8，但部署量仍然非常大。"
+  - id: A100-40G
+    aliases: [A100-40, A100-SXM-40G]
+    memory_gb: 40
+    nvlink_bandwidth_gbps: 600
+    memory_bandwidth_gbps: 1555
+    fp16_tflops: 312
+    fp8_support: false
+    fp4_support: false
+    spec_source: "NVIDIA A100 40GB datasheet 2020"
+    notes_en: "Ampere 40 GB variant. Smaller HBM limits large-model single-node deployments."
+    notes_zh: "Ampere 的 40 GB 版本，显存较小，大模型单机部署受限。"
+  - id: A40
+    aliases: [A40-48G]
+    memory_gb: 48
+    nvlink_bandwidth_gbps: 112
+    memory_bandwidth_gbps: 696
+    fp16_tflops: 150
+    fp8_support: false
+    fp4_support: false
+    spec_source: "NVIDIA A40 datasheet 2020"
+    notes_en: "Ampere workstation. 48 GB with NVLink bridge (limited bandwidth). No FP8."
+    notes_zh: "Ampere 工作站卡，48 GB + NVLink 桥接（带宽较低）。不支持 FP8。"
+  - id: A10
+    aliases: [A10-24G]
+    memory_gb: 24
+    nvlink_bandwidth_gbps: 0
+    memory_bandwidth_gbps: 600
+    fp16_tflops: 125
+    fp8_support: false
+    fp4_support: false
+    spec_source: "NVIDIA A10 datasheet 2021"
+    notes_en: "Ampere inference card. 24 GB GDDR6. Widely used for low-cost inference in enterprise clouds."
+    notes_zh: "Ampere 推理卡，24 GB GDDR6。企业云低成本推理常用配置。"
+  - id: A10G
+    aliases: [A10G-24G]
+    memory_gb: 24
+    nvlink_bandwidth_gbps: 0
+    memory_bandwidth_gbps: 600
+    fp16_tflops: 125
+    fp8_support: false
+    fp4_support: false
+    spec_source: "NVIDIA A10G — AWS-specific variant of A10, g5 instances"
+    notes_en: "AWS-specific A10 variant. Same silicon as A10, deployed in g5 EC2 instances. No NVLink."
+    notes_zh: "AWS 定制版 A10，用于 g5 EC2 实例。核心规格与 A10 相同，无 NVLink。"
+  # ========================================================================
+  # NVIDIA Volta / Turing (older, still deployed)
+  # ========================================================================
+  - id: V100-SXM2-32G
+    aliases: [V100, V100-32G, V100-SXM2]
+    memory_gb: 32
+    nvlink_bandwidth_gbps: 300
+    memory_bandwidth_gbps: 900
+    fp16_tflops: 125
+    fp8_support: false
+    fp4_support: false
+    spec_source: "NVIDIA V100 SXM2 datasheet 2017"
+    notes_en: "Volta. No FP8. Still deployed in many existing clusters — works for smaller models, tight for 70B+."
+    notes_zh: "Volta 架构。不支持 FP8，但仍在大量老集群中服役。小模型够用，70B+ 紧张。"
+  - id: V100-PCIe-32G
+    aliases: [V100-PCIe, V100-PCI]
+    memory_gb: 32
+    nvlink_bandwidth_gbps: 0
+    memory_bandwidth_gbps: 900
+    fp16_tflops: 112
+    fp8_support: false
+    fp4_support: false
+    spec_source: "NVIDIA V100 PCIe datasheet 2017 — PCIe variant of V100, no NVLink."
+    notes_en: "PCIe version of V100. No NVLink, lower clocks than SXM2. Common in older servers."
+    notes_zh: "V100 的 PCIe 版本，无 NVLink，主频稍低。老服务器常见配置。"
+  - id: T4
+    aliases: [T4-16G]
+    memory_gb: 16
+    nvlink_bandwidth_gbps: 0
+    memory_bandwidth_gbps: 320
+    fp16_tflops: 65
+    fp8_support: false
+    fp4_support: false
+    spec_source: "NVIDIA T4 datasheet 2018"
+    notes_en: "Turing inference card. 16 GB, no NVLink, no FP8. Common as the cheapest cloud GPU option."
+    notes_zh: "Turing 推理卡。16 GB，无 NVLink，无 FP8。各云厂商最便宜的 GPU 选项之一。"
+  # ========================================================================
+  # AMD (ROCm, xGMI instead of NVLink)
+  # ========================================================================
+  - id: MI325X
+    aliases: [MI325X-256G, AMD-MI325X]
+    memory_gb: 256
+    nvlink_bandwidth_gbps: 896
+    memory_bandwidth_gbps: 6000
+    fp16_tflops: 1307
+    fp8_support: true
+    fp4_support: false
+    spec_source: "AMD Instinct MI325X datasheet 2024 — 256 GB HBM3E, 6 TB/s bandwidth, 1000W TDP, CDNA 3."
+    notes_en: "AMD flagship 2024. 256 GB HBM3E (largest single-card memory in v0.1 database). Upgraded MI300X with faster HBM3E and more capacity. Dense FP16 1307 TFLOPS, FP8 2615 TFLOPS. 1000W TDP, OAM format. ROCm software stack."
+    notes_zh: "AMD 2024 年旗舰。256 GB HBM3E（v0.1 数据库中单卡最大）。MI300X 升级版，HBM3E 更快、容量更大。Dense FP16 1307 TFLOPS，FP8 2615 TFLOPS。1000W TDP，OAM 形态。需要 ROCm 软件栈。"
+  - id: MI300X
+    aliases: [MI300X-192G, AMD-MI300X]
+    memory_gb: 192
+    nvlink_bandwidth_gbps: 896
+    memory_bandwidth_gbps: 5300
+    fp16_tflops: 1307
+    fp8_support: true
+    fp4_support: false
+    spec_source: "AMD Instinct MI300X datasheet 2023-12"
+    notes_en: "AMD flagship 2023. 192 GB HBM3. xGMI 896 GB/s (like NVLink). Software stack: ROCm + vLLM. Support for DeepSeek V4 etc. lags Nvidia by weeks."
+    notes_zh: "AMD 2023 年旗舰。192 GB HBM3。xGMI 互联 896 GB/s（类 NVLink）。需要 ROCm + vLLM 栈。新模型支持通常比 NVIDIA 晚几周。"
+  - id: MI250X
+    aliases: [MI250X-128G, AMD-MI250X]
+    memory_gb: 128
+    nvlink_bandwidth_gbps: 800
+    memory_bandwidth_gbps: 3280
+    fp16_tflops: 383
+    fp8_support: false
+    fp4_support: false
+    spec_source: "AMD Instinct MI250X datasheet 2022"
+    notes_en: "AMD previous-gen. 128 GB HBM2e. No FP8. Deployed in some HPC clusters (Frontier)."
+    notes_zh: "AMD 上代数据中心卡。128 GB HBM2e，不支持 FP8。少数 HPC 集群（如 Frontier 超算）有部署。"
+  - id: MI210
+    aliases: [MI210-64G, AMD-MI210]
+    memory_gb: 64
+    nvlink_bandwidth_gbps: 300
+    memory_bandwidth_gbps: 1600
+    fp16_tflops: 181
+    fp8_support: false
+    fp4_support: false
+    spec_source: "AMD Instinct MI210 datasheet 2022 — CDNA 2, single-die version of MI250. 64 GB HBM2e."
+    notes_en: "AMD CDNA 2 single-die. 64 GB HBM2e, 1.6 TB/s. No FP8 (CDNA 2 limitation). Common as entry-level AMD datacenter card."
+    notes_zh: "AMD CDNA 2 单 die 版本，64 GB HBM2e，1.6 TB/s 带宽。不支持 FP8（CDNA 2 限制）。AMD 入门数据中心卡常见配置。"
+  # ========================================================================
+  # Intel Habana Gaudi
+  # ========================================================================
+  - id: Gaudi3
+    aliases: [Gaudi-3, Habana-Gaudi3]
+    memory_gb: 128
+    nvlink_bandwidth_gbps: 1200
+    memory_bandwidth_gbps: 3700
+    fp16_tflops: 1835
+    fp8_support: true
+    fp4_support: false
+    spec_source: "Intel Gaudi 3 datasheet 2024"
+    notes_en: "Intel Habana Gaudi 3. 128 GB HBM2e. FP8 support. Software stack: SynapseAI (not CUDA). vLLM support via Intel fork."
+    notes_zh: "Intel Habana Gaudi 3。128 GB HBM2e，支持 FP8。软件栈为 SynapseAI（非 CUDA）。vLLM 需走 Intel 分支。"
+  - id: Gaudi2
+    aliases: [Gaudi-2, Habana-Gaudi2]
+    memory_gb: 96
+    nvlink_bandwidth_gbps: 2400  # NOTE(review): 24x100GbE = 2400 Gb/s (~300 GB/s one-way); other entries use GB/s — confirm unit
+    memory_bandwidth_gbps: 2450
+    fp16_tflops: 432
+    fp8_support: true
+    fp4_support: false
+    spec_source: "Intel Gaudi 2 datasheet 2022"
+    notes_en: "Intel Habana Gaudi 2. 96 GB HBM2e with 24x100GbE on-board (used for scale-out). FP8 support."
+    notes_zh: "Intel Habana Gaudi 2。96 GB HBM2e，板载 24 个 100GbE（用于横向扩展）。支持 FP8。"
+  # ========================================================================
+  # Huawei Ascend
+  # ========================================================================
+  # The 910B "series" is actually a set of sub-variants (B1/B2/B3/B4) with
+  # different compute tiers and memory sizes. `910B` as a plain id resolves
+  # to 910B3 (the most common training configuration).
+  - id: "910A"
+    aliases: [Ascend-910A]
+    memory_gb: 32
+    nvlink_bandwidth_gbps: 400
+    memory_bandwidth_gbps: 1200
+    fp16_tflops: 256
+    fp8_support: false
+    fp4_support: false
+    spec_source: "Ascend 910 (1st gen) — 7nm, 32 GB HBM. Community-compiled spec."
+    notes_en: "Huawei Ascend 910 (1st gen, 2019). Predecessor to 910B. Still deployed in many older clusters. HCCS interconnect."
+    notes_zh: "华为昇腾 910 第一代（2019 年），910B 的前身。很多老集群仍在使用。HCCS 互联。"
+  - id: "910B1"
+    aliases: [Ascend-910B1]
+    memory_gb: 64
+    nvlink_bandwidth_gbps: 400
+    memory_bandwidth_gbps: 1600
+    fp16_tflops: 414
+    fp8_support: false
+    fp4_support: false
+    spec_source: "Ascend 910B1 — training variant, Atlas 800T A2. Commonly cited as top-tier 910B sub-variant; TSMC 7nm process."
+    notes_en: "Top-tier 910B training variant. 64 GB HBM2, 414 TFLOPS FP16. Used in Atlas 800T A2 training servers. No native FP8."
+    notes_zh: "910B 系列顶配训练版本。64 GB HBM2，FP16 算力 414 TFLOPS。搭载于 Atlas 800T A2 训练服务器。不原生支持 FP8。"
+  - id: "910B2"
+    aliases: [Ascend-910B2]
+    memory_gb: 64
+    nvlink_bandwidth_gbps: 400
+    memory_bandwidth_gbps: 1600
+    fp16_tflops: 376
+    fp8_support: false
+    fp4_support: false
+    spec_source: "Ascend 910B2 — training variant, commonly cited as standard 910B training configuration."
+    notes_en: "Standard 910B training variant. 64 GB HBM2, 376 TFLOPS FP16. General-purpose training server baseline."
+    notes_zh: "910B 常规训练版本。64 GB HBM2，FP16 算力 376 TFLOPS。通用训练服务器标准配置。"
+  - id: "910B3"
+    aliases: [Ascend-910B3, "910B", Ascend-910B]
+    memory_gb: 64
+    nvlink_bandwidth_gbps: 400
+    memory_bandwidth_gbps: 1600
+    fp16_tflops: 313
+    fp8_support: false
+    fp4_support: false
+    spec_source: "Ascend 910B3 — training variant, SMIC-produced per industry reports. (aliased as bare `910B` for convenience)"
+    notes_en: "910B3 training variant, 313 TFLOPS FP16. Believed to be SMIC-produced (vs TSMC for B1/B2). The `910B` bare name resolves here since B3 is the most commonly referenced."
+    notes_zh: "910B3 训练版本，FP16 算力 313 TFLOPS。业界普遍认为由中芯国际生产（B1/B2 据传为台积电）。裸写 `910B` 时默认解析到此条目（最常被引用）。"
+  - id: "910B4"
+    aliases: [Ascend-910B4]
+    memory_gb: 32
+    nvlink_bandwidth_gbps: 400
+    memory_bandwidth_gbps: 1600
+    fp16_tflops: 280
+    fp8_support: false
+    fp4_support: false
+    spec_source: "Ascend 910B4 — inference variant, 32 GB HBM (half of B1/B2/B3). Atlas 800I A2 inference server."
+    notes_en: "910B4 is the inference-oriented 910B variant. 32 GB HBM (half of training variants), 280 TFLOPS FP16. Deployed in Atlas 800I A2 inference servers."
+    notes_zh: "910B4 是 910B 系列的推理版本。32 GB HBM（训练版本的一半），FP16 算力 280 TFLOPS。搭载于 Atlas 800I A2 推理服务器。"
+  - id: "910C"
+    aliases: [Ascend-910C]
+    memory_gb: 64
+    nvlink_bandwidth_gbps: 400
+    memory_bandwidth_gbps: 3200
+    fp16_tflops: 780
+    fp8_support: false
+    fp4_support: false
+    spec_source: "Huawei Ascend 910C — launched 2024, commonly cited specs pending official datasheet"
+    notes_en: "Huawei Ascend 910C (2024). Roughly 2x compute vs 910B at similar memory. FP8 support status unclear — check CANN version notes. Software ecosystem matures but still behind NVIDIA."
+    notes_zh: "华为昇腾 910C（2024 年）。算力大约是 910B 的两倍，显存相当。FP8 支持情况需看 CANN 版本。软件生态持续完善但仍落后于 NVIDIA。"
+  - id: Atlas-300I-Duo
+    aliases: [Atlas300IDuo, 300I-Duo]
+    memory_gb: 48
+    nvlink_bandwidth_gbps: 0
+    memory_bandwidth_gbps: 204
+    fp16_tflops: 140
+    fp8_support: false
+    fp4_support: false
+    spec_source: "Huawei Atlas 300I Duo inference card — 2x Ascend 310P per card. 140 TFLOPS FP16 per card, 48 GB LPDDR4X."
+    notes_en: "Huawei Atlas 300I Duo inference card: 2x Ascend 310P with combined 48 GB LPDDR4X (96 GB variant available). 280 TOPS INT8. LPDDR4X gives 204 GB/s total bandwidth — much lower than HBM-based cards. PCIe-only, no NVLink. Best for cost-sensitive inference."
+    notes_zh: "华为 Atlas 300I Duo 推理卡：双 Ascend 310P，合计 48 GB LPDDR4X（另有 96 GB 版本）。INT8 280 TOPS。显存是 LPDDR4X，带宽 204 GB/s，远低于 HBM 卡。仅 PCIe，无 NVLink。主要面向成本敏感的推理场景。"
+  # ========================================================================
+  # Chinese domestic AI accelerators (non-NVIDIA / non-AMD)
+  # ========================================================================
+  - id: MXC500
+    aliases: [MetaX-MXC500, XiYun-C500, 曦云C500]
+    memory_gb: 64
+    nvlink_bandwidth_gbps: 800
+    memory_bandwidth_gbps: 1800
+    fp16_tflops: 240
+    fp8_support: false
+    fp4_support: false
+    spec_source: "MetaX 沐曦 MXC500 / 曦云 C500 (PCIe variant, 350W). OAM variant has 280 TFLOPS FP16 @ 450W. 64 GB HBM2e, 1.8 TB/s memory bandwidth, MetaXLink interconnect."
+    notes_en: "MetaX (沐曦) MXC500. 7nm, CUDA-compatible via MXMACA stack. PCIe variant: 240 TFLOPS FP16, 350W. OAM variant: 280 TFLOPS FP16, 450W. Targets A100-class workloads. No native FP8."
+    notes_zh: "沐曦曦云 C500。7nm 工艺，通过 MXMACA 软件栈兼容 CUDA。PCIe 版本 FP16 240 TFLOPS / 350W，OAM 版本 280 TFLOPS / 450W。对标 A100 场景。不原生支持 FP8。"
+  - id: MXC550
+    aliases: [MetaX-MXC550, XiYun-C550, 曦云C550]
+    memory_gb: 64
+    nvlink_bandwidth_gbps: 896
+    memory_bandwidth_gbps: 1600
+    fp16_tflops: 240
+    fp8_support: false
+    fp4_support: false
+    spec_source: "MetaX 沐曦 MXC550 / 曦云 C550 (OAM, 2024). Partial specs from third-party comparison docs; full datasheet TBD. 8-card fabric bandwidth 896 GB/s."
+    notes_en: "MetaX (沐曦) MXC550 — 2024 OAM-format flagship. Supports OAM 1.5 + 2.0. 8-card fabric bandwidth 896 GB/s. Full specs pending official datasheet — figures here are from third-party comparison articles."
+    notes_zh: "沐曦曦云 C550 — 2024 年 OAM 形态旗舰。支持 OAM 1.5 + 2.0 规范。八卡全互联带宽 896 GB/s。完整规格待官方数据表披露，此处数字来自第三方对比资料。"
+  - id: Kunlun-P800
+    aliases: [KunlunXin-P800, 昆仑芯P800, Kunlun-Gen3]
+    memory_gb: 96
+    nvlink_bandwidth_gbps: 400
+    memory_bandwidth_gbps: 2000
+    fp16_tflops: 345
+    fp8_support: true
+    fp4_support: false
+    spec_source: "KunlunXin P800 (3rd gen, 2024). 96 GB HBM3 (largest among Chinese domestic AI chips). Baidu Cloud uses P800 for first-party inference. Specs partially inferred from public Baidu announcements; official datasheet limited distribution."
+    notes_en: "Baidu KunlunXin P800 — 3rd gen, 2024. 96 GB HBM3. Reported to support 8-bit inference and MoE optimizations. Baidu's internal clusters run Kunlun P800 at 10k+ card scale. Figures here are from public Baidu materials; official spec sheet not fully public."
+    notes_zh: "百度昆仑芯 P800 — 第三代，2024 年。96 GB HBM3（国产 AI 芯片中显存最大之一）。报告支持 8bit 推理和 MoE 优化。百度内部 1 万卡以上规模部署。数字来自百度公开资料，完整规格表未完全披露。"
+  - id: Kunlun-R200
+    aliases: [KunlunXin-R200, 昆仑芯R200, Kunlun-Gen2]
+    memory_gb: 32
+    nvlink_bandwidth_gbps: 200
+    memory_bandwidth_gbps: 512
+    fp16_tflops: 128
+    fp8_support: false
+    fp4_support: false
+    spec_source: "KunlunXin R200 (2nd gen, 2021). 7nm XPU architecture. FP16 128 TFLOPS / INT8 256 TOPS."
+    notes_en: "Baidu KunlunXin R200 — 2nd gen, 7nm. FP16 128 TFLOPS, INT8 256 TOPS. XPU architecture. PCIe 4.0 + XCCL interconnect. No FP8."
+    notes_zh: "百度昆仑芯 R200 — 第二代，7nm XPU 架构。FP16 128 TFLOPS，INT8 256 TOPS。PCIe 4.0 + 昆仑芯互联 XCCL。无 FP8。"
+  - id: BR100
+    aliases: [Biren-BR100, 壁仞BR100, 壁砺100]
+    memory_gb: 64
+    nvlink_bandwidth_gbps: 512
+    memory_bandwidth_gbps: 1640
+    fp16_tflops: 1024
+    fp8_support: false
+    fp4_support: false
+    spec_source: "Biren 壁仞 BR100 (OAM, 550W). 7nm Chiplet, 77B transistors. BF16/FP16 1024 TFLOPS, INT8 2048 TOPS, 64 GB HBM2e 1.64 TB/s. BLINK 512 GB/s 8-card fabric."
+    notes_en: "Biren BR100 (壁仞) — 2022 flagship. OAM format, 550W. 1024 TFLOPS BF16/FP16 (PFLOPS class), 64 GB HBM2e. BLINK interconnect 512 GB/s (8-card fabric). No FP8. US export-restricted since 2022 — production status uncertain."
+    notes_zh: "壁仞 BR100 — 2022 年旗舰 OAM 卡，550W。BF16/FP16 1024 TFLOPS（PFLOPS 级），64 GB HBM2e。BLINK 互联 512 GB/s（8 卡全互联）。无 FP8。2022 年被美国出口管制，后续量产状态不明。"
+  - id: BR104
+    aliases: [Biren-BR104, 壁仞BR104, 壁砺104]
+    memory_gb: 32
+    nvlink_bandwidth_gbps: 128
+    memory_bandwidth_gbps: 820
+    fp16_tflops: 512
+    fp8_support: false
+    fp4_support: false
+    spec_source: "Biren 壁仞 BR104 (PCIe, 300W). Single-die version of BR100 with halved specs. BF16/FP16 512 TFLOPS, 32 GB HBM2e. Won MLPerf Inference ResNet50 and BERT single-card top-1 in its class."
+    notes_en: "Biren BR104 — PCIe single-die version of BR100. 300W, 512 TFLOPS BF16/FP16, 32 GB HBM2e. Won MLPerf Inference BERT (1.58x A100 in server mode). No FP8. Export-restricted."
+    notes_zh: "壁仞 BR104 — BR100 的单 die PCIe 版本。300W，BF16/FP16 512 TFLOPS，32 GB HBM2e。MLPerf Inference BERT 测试 server 模式性能达 A100 的 1.58 倍。无 FP8。已被出口管制。"
+  - id: BI-V100
+    aliases: [Iluvatar-BI-V100, 天数天垓100, TianGai-100]
+    memory_gb: 32
+    nvlink_bandwidth_gbps: 64
+    memory_bandwidth_gbps: 1200
+    fp16_tflops: 147
+    fp8_support: false
+    fp4_support: false
+    spec_source: "Iluvatar CoreX 天数智芯 BI-V100 (天垓100). 7nm, SIMT, 24B transistors, 2.5D CoWoS packaging. FP16 147 TFLOPS / INT8 295 TOPS. 32 GB HBM2, 1.2 TB/s bandwidth. PCIe 4.0 x16, 250W TDP."
+    notes_en: "Iluvatar (天数智芯) BI-V100 — training/general-purpose. 7nm SIMT architecture, 32 GB HBM2, 1.2 TB/s memory bandwidth. FP16 147 TFLOPS, INT8 295 TOPS. 250W TDP. Interconnect bandwidth per card is modest (~64 GB/s shared)."
+    notes_zh: "天数智芯 BI-V100（天垓100）— 训练/通用 GPU。7nm SIMT 架构，32 GB HBM2，1.2 TB/s 显存带宽。FP16 147 TFLOPS，INT8 295 TOPS。250W TDP。单卡互联带宽 ~64 GB/s，相对较低。"
+  - id: MR-V100
+    aliases: [Iluvatar-MR-V100, 天数智铠100, ZhiKai-100]
+    memory_gb: 32
+    nvlink_bandwidth_gbps: 0
+    memory_bandwidth_gbps: 1200
+    fp16_tflops: 100
+    fp8_support: false
+    fp4_support: false
+    spec_source: "Iluvatar CoreX 天数智芯 智铠100 (MR-V100) 2022. Inference card, 32 GB HBM2E, ~200 TFLOPS BF16/FP16-low-precision-aggregated, 128-channel 1080p video decode, 150W TDP."
+    notes_en: "Iluvatar inference card (智铠100). 32 GB HBM2E. 150W TDP. Primarily inference-focused — mixed-precision aggregated throughput ~200 TFLOPS."
+    notes_zh: "天数智芯智铠100 推理卡。32 GB HBM2E，150W TDP。主要面向推理场景，混合精度聚合算力约 200 TFLOPS。"
+  - id: MLU370-X8
+    aliases: [Cambricon-MLU370-X8, 寒武纪MLU370-X8, 思元370-X8]
+    memory_gb: 48
+    nvlink_bandwidth_gbps: 200
+    memory_bandwidth_gbps: 614
+    fp16_tflops: 48
+    fp8_support: false
+    fp4_support: false
+    spec_source: "Cambricon 寒武纪 MLU370-X8 (dual MLU370 chiplet, 250W). 48 GB LPDDR5, INT8 256 TOPS, FP32 24 TFLOPS (FP16 ~48 TFLOPS estimated, official not given). MLU-Link 200 GB/s."
+    notes_en: "Cambricon (寒武纪) MLU370-X8 — dual-chip package, 250W. 48 GB LPDDR5 (not HBM), INT8 256 TOPS, FP32 24 TFLOPS. MLU-Link 200 GB/s for 8-card setups. LPDDR5 means lower memory bandwidth than HBM cards."
+    notes_zh: "寒武纪 MLU370-X8 — 双芯粒封装，250W。48 GB LPDDR5（非 HBM），INT8 256 TOPS，FP32 24 TFLOPS。MLU-Link 200 GB/s，支持 8 卡部署。LPDDR5 意味着显存带宽低于 HBM 卡。"
+  - id: MLU590
+    aliases: [Cambricon-MLU590, 寒武纪MLU590, 思元590]
+    memory_gb: 80
+    nvlink_bandwidth_gbps: 372
+    memory_bandwidth_gbps: 2000
+    fp16_tflops: 314
+    fp8_support: false
+    fp4_support: false
+    spec_source: "Cambricon 寒武纪 思元590 (MLU590) — 7nm, MLUv02/MLUarch05. 80 GB HBM (likely HBM2e based on 2 TB/s bandwidth), FP16 314 TFLOPS, FP32 80 TFLOPS, MLU-Link 372 GB/s. Used at Baidu ERNIE (文心一言) project."
+    notes_en: "Cambricon (寒武纪) MLU590 — flagship AI training chip. 80 GB HBM, 2 TB/s memory bandwidth. FP16 314 TFLOPS (dense). MLU-Link 372 GB/s 8-card fabric. Comparable FP16 compute to NVIDIA A100 level. No FP8. Production volume and ecosystem still maturing."
+    notes_zh: "寒武纪思元590 — 旗舰 AI 训练芯片。80 GB HBM，2 TB/s 显存带宽。FP16 314 TFLOPS（dense），综合性能约为 A100 级别。MLU-Link 372 GB/s 八卡互联。无 FP8。量产规模和生态仍在成熟。"
+  - id: Hygon-K100-AI
+    aliases: [K100-AI, 海光K100AI, DCU-K100-AI]
+    memory_gb: 64
+    nvlink_bandwidth_gbps: 184
+    memory_bandwidth_gbps: 896
+    fp16_tflops: 192
+    fp8_support: false
+    fp4_support: false
+    spec_source: "Hygon 海光 K100 AI — DCU architecture (GPGPU+AI hybrid), 64 GB HBM, 896 GB/s memory bandwidth, 350W TDP. FP16 192 TFLOPS dense (some sources cite 256 TFLOPS but values vary). xGMI 184 GB/s."
+    notes_en: "Hygon (海光) K100 AI — DCU series. 64 GB HBM, 896 GB/s bandwidth. FP16 192 TFLOPS (industry reports vary 100-256 TFLOPS depending on compute unit/mode). ROCm-compatible, can leverage AMD software ecosystem. Positioned against A800 for Chinese market. 350W TDP."
+    notes_zh: "海光 K100 AI — DCU 系列。64 GB HBM，896 GB/s 带宽。FP16 192 TFLOPS（公开资料数字因计算单元和精度模式不同有 100-256 TFLOPS 差异）。兼容 ROCm，可复用 AMD 软件生态。面向国产 A800 替代场景。350W TDP。"
+  - id: Hygon-Z100
+    aliases: [Z100, 海光Z100, DCU-Z100, 深算二号]
+    memory_gb: 32
+    nvlink_bandwidth_gbps: 184
+    memory_bandwidth_gbps: 1000
+    fp16_tflops: 180
+    fp8_support: false
+    fp4_support: false
+    spec_source: "Hygon 海光 DCU Z100 (深算二号) — 32 GB HBM2, 1 TB/s bandwidth, 8192 compute cores, FP32 90 TFLOPS, FP16 ~180 TFLOPS (2x FP32), FP64 10.8 TFLOPS. xGMI 184 GB/s. Performance reported as 80-90% of A100. 350W TDP."
+    notes_en: "Hygon (海光) DCU Z100 / 深算二号. 32 GB HBM2, 1 TB/s bandwidth, 8192 compute units. FP16 180 TFLOPS, FP32 90 TFLOPS, FP64 10.8 TFLOPS. 350W. Performance cited at 80-90% of A100. ROCm stack, PCIe Gen4 + xGMI multi-card."
+    notes_zh: "海光 DCU Z100（深算二号）。32 GB HBM2，1 TB/s 带宽，8192 计算单元。FP16 180 TFLOPS，FP32 90 TFLOPS，FP64 10.8 TFLOPS。350W。综合性能约为 A100 的 80-90%。基于 ROCm 栈，PCIe Gen4 + xGMI 多卡互联。"
+  - id: MTT-S4000
+    aliases: [MooreThreads-S4000, 摩尔线程S4000, MTT-S4000-48G]
+    memory_gb: 48
+    nvlink_bandwidth_gbps: 240
+    memory_bandwidth_gbps: 768
+    fp16_tflops: 100
+    fp8_support: false
+    fp4_support: false
+    spec_source: "Moore Threads MTT S4000 datasheet 2023 — 3rd-gen MUSA (曲院). 48 GB GDDR6, 768 GB/s bandwidth. FP16/BF16 100 TFLOPS, INT8 200 TOPS. MTLink 1.0 240 GB/s."
+    notes_en: "Moore Threads (摩尔线程) S4000 — domestic AI training card. 48 GB GDDR6 (not HBM), 768 GB/s. FP16/BF16 100 TFLOPS. MTLink 1.0 240 GB/s. CUDA compatibility via MUSA translation."
+    notes_zh: "摩尔线程 S4000 — 国产训推加速卡。48 GB GDDR6（非 HBM），768 GB/s 带宽。FP16/BF16 100 TFLOPS。MTLink 1.0 互联 240 GB/s。通过 MUSA 兼容 CUDA 生态。"
+  - id: MTT-S3000
+    aliases: [MooreThreads-S3000, 摩尔线程S3000]
+    memory_gb: 32
+    nvlink_bandwidth_gbps: 0
+    memory_bandwidth_gbps: 448
+    fp16_tflops: 30
+    fp8_support: false
+    fp4_support: false
+    spec_source: "Moore Threads MTT S3000 — MUSA 春晓 architecture. 32 GB GDDR6, 448 GB/s. FP32 ~15.2 TFLOPS inferred from S4000 comparison (S4000 is 64%+ higher); FP16 ~30 TFLOPS estimate (datasheet not fully public)."
+    notes_en: "Moore Threads (摩尔线程) S3000 — predecessor to S4000. 32 GB GDDR6, 448 GB/s. FP16 specs not fully published; estimated ~30 TFLOPS based on S4000 comparison. Multi-purpose server GPU, also supports rendering."
+    notes_zh: "摩尔线程 S3000 — S4000 的前代。32 GB GDDR6，448 GB/s。FP16 官方未完全披露，基于 S4000 对比推算约 30 TFLOPS。通用服务器 GPU，兼顾渲染场景。"

src/llm_cal/hardware/loader.py ADDED Viewed

	@@ -0,0 +1,77 @@

+"""Hardware database loader + lookup."""
+from __future__ import annotations
+from functools import lru_cache
+from importlib.resources import files
+from pathlib import Path
+from typing import Literal
+from pydantic import BaseModel, Field
+from llm_cal.common.yaml_loader import load_yaml
+class GPUSpec(BaseModel):
+    """One GPU entry in the hardware database."""
+    id: str
+    aliases: list[str] = Field(default_factory=list)
+    memory_gb: int
+    nvlink_bandwidth_gbps: int
+    # HBM/GDDR memory bandwidth (NOT NVLink). This is the critical number for
+    # decode throughput: decode is memory-bandwidth-bound, and per-token
+    # latency = active_weight_bytes / (memory_bandwidth × utilization).
+    # 0 or None means unknown (performance module will skip bandwidth checks).
+    memory_bandwidth_gbps: int | None = None
+    fp16_tflops: float
+    fp8_support: bool
+    fp4_support: bool
+    notes_en: str | None = None
+    notes_zh: str | None = None
+    # Where the numeric specs came from. A URL to a vendor datasheet / trusted
+    # benchmark, or a short note like "NVIDIA H100 datasheet 2024-Q3". Lets
+    # users audit the source; honesty-over-convenience principle.
+    spec_source: str | None = None
+    def localized_notes(self, locale: Literal["en", "zh"]) -> str | None:
+        if locale == "zh":
+            return self.notes_zh or self.notes_en
+        return self.notes_en or self.notes_zh
+class GPUDatabase(BaseModel):
+    """Top-level model of the hardware database: a schema version plus the GPU entries."""
+    schema_version: int
+    gpus: list[GPUSpec]
+class UnknownGPUError(Exception):
+    """User asked for a GPU id we don't know.
+    Raised by `lookup` when the name matches no entry id and no alias.
+    """
+def _default_path() -> Path:
+    """Locate the bundled gpu_database.yaml inside the installed package."""
+    return Path(str(files("llm_cal.hardware").joinpath("gpu_database.yaml")))
+@lru_cache(maxsize=1)
+def load_database(path: Path | None = None) -> GPUDatabase:
+    return load_yaml(path or _default_path(), GPUDatabase)
+def lookup(gpu: str, db: GPUDatabase | None = None) -> GPUSpec:
+    """Look up a GPU by id or alias. Case-insensitive."""
+    database = db or load_database()
+    target = gpu.strip().upper()
+    for spec in database.gpus:
+        if spec.id.upper() == target:
+            return spec
+        if any(alias.upper() == target for alias in spec.aliases):
+            return spec
+    # Helpful rejection
+    if "X" in target and target.split("X")[-1].isdigit():
+        raise UnknownGPUError(
+            f"'{gpu}' looks like old 'H800x8' format. "
+            f"Use `--gpu {target.split('X')[0]} --gpu-count {target.split('X')[-1]}` instead."
+        )
+    raise UnknownGPUError(f"Unknown GPU '{gpu}'. Known: {', '.join(s.id for s in database.gpus)}")

src/llm_cal/llm_review/__init__.py ADDED Viewed

File without changes

src/llm_cal/llm_review/reviewer.py ADDED Viewed

	@@ -0,0 +1,218 @@

+"""Optional LLM-based second opinion on the tool's derivation trace.
+Design constraints (from the tool's honesty principle):
+  1. Never overrides the 6 primary labels. LLM responses are tagged
+     [llm-opinion] — a distinct 7th label.
+  2. Opt-in only — requires --llm-review flag AND env vars set.
+  3. Non-fatal — if the API call fails, the main report still works.
+  4. User-chosen provider — supports any OpenAI-compatible endpoint
+     (OpenAI, DeepSeek, Moonshot, Zhipu, local vLLM, etc.)
+  5. Deterministic input — the prompt is built from the --explain
+     derivation trace, not free-form. The LLM gets structured math,
+     not prose.
+  6. The LLM's job is to CRITIQUE, not to REWRITE. The prompt
+     explicitly forbids generating new numbers.
+Environment variables:
+  LLM_CAL_REVIEWER_API_KEY   (required)
+  LLM_CAL_REVIEWER_BASE_URL  (default: https://api.openai.com/v1)
+  LLM_CAL_REVIEWER_MODEL     (default: gpt-4o)
+"""
+from __future__ import annotations
+import os
+from dataclasses import dataclass
+from typing import Literal
+import httpx
+from llm_cal.core.explain import ExplainEntry
+Locale = Literal["en", "zh"]
+@dataclass(frozen=True)
+class LLMReviewResult:
+    """Outcome of one LLM review attempt (immutable)."""
+    ok: bool  # True only when a reply was received and parsed
+    content: str | None  # the reviewer's reply text; None on failure
+    error: str | None  # description of what went wrong; None on success
+    model: str  # model name the request was configured with
+    base_url: str  # API base URL the request was configured with
+def run_review(
+    entries: list[ExplainEntry],
+    locale: Locale,
+    timeout_s: float = 60.0,
+) -> LLMReviewResult:
+    """Send the derivation trace to an LLM for audit.
+    Returns a LLMReviewResult. Never raises — always returns a result
+    object even on failure.
+    """
+    api_key = os.environ.get("LLM_CAL_REVIEWER_API_KEY")
+    base_url = os.environ.get("LLM_CAL_REVIEWER_BASE_URL", "https://api.openai.com/v1").rstrip("/")
+    model = os.environ.get("LLM_CAL_REVIEWER_MODEL", "gpt-4o")
+    if not api_key:
+        return LLMReviewResult(
+            ok=False,
+            content=None,
+            error=(
+                "LLM_CAL_REVIEWER_API_KEY env var not set. "
+                "Set it to the API key of an OpenAI-compatible endpoint "
+                "(OpenAI, DeepSeek, Moonshot, Zhipu, etc.)."
+            ),
+            model=model,
+            base_url=base_url,
+        )
+    prompt = _build_prompt(entries, locale)
+    try:
+        with httpx.Client(timeout=timeout_s) as client:
+            resp = client.post(
+                f"{base_url}/chat/completions",
+                headers={
+                    "Authorization": f"Bearer {api_key}",
+                    "Content-Type": "application/json",
+                },
+                json={
+                    "model": model,
+                    "messages": [
+                        {"role": "system", "content": _system_prompt(locale)},
+                        {"role": "user", "content": prompt},
+                    ],
+                    "temperature": 0.1,
+                    "max_tokens": 6000,
+                },
+            )
+    except (httpx.TimeoutException, httpx.ConnectError) as e:
+        return LLMReviewResult(
+            ok=False,
+            content=None,
+            error=f"{type(e).__name__}: {e}",
+            model=model,
+            base_url=base_url,
+        )
+    if resp.status_code != 200:
+        return LLMReviewResult(
+            ok=False,
+            content=None,
+            error=f"HTTP {resp.status_code}: {resp.text[:500]}",
+            model=model,
+            base_url=base_url,
+        )
+    try:
+        data = resp.json()
+        content = data["choices"][0]["message"]["content"]
+    except (KeyError, ValueError) as e:
+        return LLMReviewResult(
+            ok=False,
+            content=None,
+            error=f"Malformed response: {type(e).__name__}: {e}",
+            model=model,
+            base_url=base_url,
+        )
+    return LLMReviewResult(ok=True, content=content, error=None, model=model, base_url=base_url)
+def _system_prompt(locale: Locale) -> str:
+    if locale == "zh":
+        return (
+            "你是一个大模型推理硬件计算工具的独立审计者。工具产出确定性的推导链，"
+            "你的工作是发现数学错误、不合理假设或遗漏。你不负责重新计算，"
+            "只负责评论和确认。输出简体中文。"
+        )
+    return (
+        "You are an independent auditor for a deterministic LLM inference hardware "
+        "calculator. The tool produces a derivation trace; your job is to find math "
+        "errors, unreasonable assumptions, or missing considerations. You do NOT "
+        "recalculate; you only critique and confirm."
+    )
+def _build_prompt(entries: list[ExplainEntry], locale: Locale) -> str:
+    trace = "\n\n".join(_format_entry(e) for e in entries)
+    if locale == "zh":
+        return _prompt_zh(trace)
+    return _prompt_en(trace)
+def _format_entry(entry: ExplainEntry) -> str:
+    parts: list[str] = [f"## {entry.heading}"]
+    parts.append(f"Formula:\n{entry.formula}")
+    if entry.inputs:
+        parts.append("Inputs:")
+        for inp in entry.inputs:
+            note = f" ({inp.note})" if inp.note else ""
+            parts.append(f"  - {inp.name} = {inp.value} {inp.label}{note}")
+    if entry.steps:
+        parts.append("Steps:")
+        for step in entry.steps:
+            parts.append(f"  {step}")
+    parts.append(f"Result: {entry.result}")
+    if entry.source:
+        parts.append(f"Source: {entry.source}")
+    return "\n".join(parts)
+def _prompt_en(trace: str) -> str:
+    """Return the English user message: ``trace`` inside DERIVATION_TRACE tags plus the reply format and rules."""
+    return f"""The deterministic tool produced this derivation trace for one model evaluation. \
+Audit it.
+<DERIVATION_TRACE>
+{trace}
+</DERIVATION_TRACE>
+Respond in this structure. If a section has nothing to flag, write "none".
+## Critical issues
+(math errors or wrong formulas — would give wrong final answer)
+## Moderate concerns
+(unreasonable assumptions, factors off by 2x+, missing TP/sharding effects, etc.)
+## Minor notes
+(clarifications, stylistic, optional improvements)
+## Consensus check
+(which ExplainEntry headings look correct? name them explicitly)
+Rules:
+  - Cite specific ExplainEntry heading names. Be concrete.
+  - Do NOT produce new numbers. Only critique.
+  - If you don't know, say so. Do not hallucinate.
+  - All your output must be tagged as a second opinion, NOT authoritative."""
+def _prompt_zh(trace: str) -> str:
+    """Return the Chinese user message: ``trace`` inside DERIVATION_TRACE tags plus the reply format and rules."""
+    return f"""下面是工具产出的一份完整推导链。请审计。
+<DERIVATION_TRACE>
+{trace}
+</DERIVATION_TRACE>
+按下面结构回复。没内容的段落写"无"。
+## 关键错误
+（数学错误或公式错误 —— 会导致最终答案错）
+## 中度疑虑
+（不合理假设、因子偏差 2x+、遗漏的 TP 分摊等）
+## 次要备注
+（澄清、风格、可选改进）
+## 一致性核查
+（哪些 ExplainEntry 标题看起来是对的？明确列出）
+规则：
+  - 必须引用具体的 ExplainEntry 标题名。具体点。
+  - 不要产出新数字，只做评论。
+  - 不确定的地方直说。不要编造。
+  - 你的所有输出都只是 second opinion，不是权威答案。"""

src/llm_cal/model_source/__init__.py ADDED Viewed

File without changes

src/llm_cal/model_source/auth.py ADDED Viewed

	@@ -0,0 +1,33 @@

+"""Token discovery + user-friendly auth error messages."""
+from __future__ import annotations
+import os
+def get_hf_token() -> str | None:
+    """Read HF token from standard env vars.
+    `HF_TOKEN` wins over `HUGGING_FACE_HUB_TOKEN` for consistency with the
+    huggingface-cli default.
+    """
+    return os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
+def get_modelscope_token() -> str | None:
+    return os.environ.get("MODELSCOPE_API_TOKEN") or os.environ.get("MODELSCOPE_TOKEN")
+def hf_auth_error_message(model_id: str) -> str:
+    return (
+        f"Model '{model_id}' requires authentication (gated or private).\n"
+        "Set HF_TOKEN env var or run: huggingface-cli login"
+    )
+def modelscope_auth_error_message(model_id: str) -> str:
+    # Chinese user-facing message — full-width punctuation is intentional.
+    return (
+        f"模型 '{model_id}' 需要登录（gated 或 私有）。\n"
+        "设置 MODELSCOPE_API_TOKEN 环境变量，或执行：modelscope login"
+    )

src/llm_cal/model_source/base.py ADDED Viewed

	@@ -0,0 +1,58 @@

+"""ModelSource ABC — HF and ModelScope implement this."""
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any
+@dataclass(frozen=True)
+class SiblingFile:
+    """One file in the model repo. `size` is bytes, or None if unknown."""
+    filename: str  # file path as reported by the source
+    size: int | None  # size in bytes; None when the source reported none
+@dataclass(frozen=True)
+class ModelArtifact:
+    """The raw material a ModelSource returns.
+    We do NOT interpret anything here — interpretation lives in `architecture/`
+    and `weight_analyzer/`. This is the thin "fetch" layer.
+    Instances are immutable (frozen dataclass) and `siblings` is a tuple.
+    """
+    source: str  # "huggingface" | "modelscope"
+    model_id: str
+    commit_sha: str | None  # HF provides this; used as cache key component
+    config: dict[str, Any]  # parsed config.json
+    siblings: tuple[SiblingFile, ...]  # all files in the repo
+class ModelNotFoundError(Exception):
+    """Model id does not exist on this source (HTTP 404)."""
+class AuthRequiredError(Exception):
+    """Model is gated / private (HTTP 401/403) — user must set a token."""
+class SourceUnavailableError(Exception):
+    """Network error, timeout, rate limit (HTTP 429), server error (5xx), etc."""
+class ModelSource(ABC):
+    """Abstract interface for HF / ModelScope / future sources.
+    Concrete sources set `name` and implement `fetch`.
+    """
+    name: str  # subclasses override
+    @abstractmethod
+    def fetch(self, model_id: str) -> ModelArtifact:
+        """Fetch config.json + siblings for the given model.
+        Args:
+            model_id: Repository identifier on this source.
+        Returns:
+            A ModelArtifact with the parsed config and the repo's file list.
+        Raises:
+            ModelNotFoundError: 404.
+            AuthRequiredError: 401/403 (gated/private).
+            SourceUnavailableError: 429, 5xx, timeout, network down.
+        """

src/llm_cal/model_source/huggingface.py ADDED Viewed

	@@ -0,0 +1,118 @@

+"""HuggingFace source. Uses `huggingface_hub` for metadata + `httpx` for config fetch.
+Anti-pattern warning: do NOT call `list_repo_files()` then head-request each file.
+Always use `model_info(files_metadata=True)` which returns all sibling sizes in
+ONE request. Verified in `tests/test_hf.py` by asserting HTTP call count.
+"""
+from __future__ import annotations
+import json
+from typing import Any
+import httpx
+from huggingface_hub import HfApi
+from huggingface_hub.utils import (
+    GatedRepoError,
+    HfHubHTTPError,
+    RepositoryNotFoundError,
+)
+from llm_cal.model_source.auth import get_hf_token, hf_auth_error_message
+from llm_cal.model_source.base import (
+    AuthRequiredError,
+    ModelArtifact,
+    ModelNotFoundError,
+    ModelSource,
+    SiblingFile,
+    SourceUnavailableError,
+)
+_CONFIG_URL = "https://huggingface.co/{model_id}/resolve/{revision}/config.json"
+class HuggingFaceSource(ModelSource):
+    name = "huggingface"
+    def __init__(self, endpoint: str | None = None, timeout_s: float = 30.0) -> None:
+        # huggingface_hub picks up HF_ENDPOINT env; we pass through for explicitness
+        self._api = HfApi(endpoint=endpoint, token=get_hf_token())
+        self._timeout_s = timeout_s
+        self._endpoint = endpoint or "https://huggingface.co"
+    def fetch(self, model_id: str) -> ModelArtifact:
+        token = get_hf_token()
+        # Step 1: siblings + commit sha in ONE request.
+        # CRITICAL: files_metadata=True — see module docstring.
+        try:
+            info = self._api.model_info(
+                repo_id=model_id,
+                files_metadata=True,
+                token=token,
+            )
+        except RepositoryNotFoundError as e:
+            raise ModelNotFoundError(f"Model '{model_id}' not found on HuggingFace.") from e
+        except GatedRepoError as e:
+            raise AuthRequiredError(hf_auth_error_message(model_id)) from e
+        except HfHubHTTPError as e:
+            status = getattr(e.response, "status_code", None)
+            if status in (401, 403):
+                raise AuthRequiredError(hf_auth_error_message(model_id)) from e
+            if status == 429:
+                retry = e.response.headers.get("Retry-After", "unknown")
+                raise SourceUnavailableError(
+                    f"HuggingFace rate limit (429). Retry-After: {retry}s. "
+                    "Setting HF_TOKEN increases your quota."
+                ) from e
+            raise SourceUnavailableError(f"HuggingFace error ({status}): {e}") from e
+        except (httpx.TimeoutException, TimeoutError) as e:
+            raise SourceUnavailableError(
+                f"HuggingFace request timed out after {self._timeout_s}s."
+            ) from e
+        siblings = tuple(
+            SiblingFile(filename=s.rfilename, size=s.size) for s in (info.siblings or [])
+        )
+        commit_sha = info.sha
+        # Step 2: fetch config.json. If commit sha is available, pin to it so we don't
+        # race with repo updates between the two calls.
+        config = self._fetch_config(model_id, commit_sha or "main", token)
+        return ModelArtifact(
+            source=self.name,
+            model_id=model_id,
+            commit_sha=commit_sha,
+            config=config,
+            siblings=siblings,
+        )
+    def _fetch_config(self, model_id: str, revision: str, token: str | None) -> dict[str, Any]:
+        url = _CONFIG_URL.format(model_id=model_id, revision=revision)
+        headers = {"Authorization": f"Bearer {token}"} if token else {}
+        try:
+            resp = httpx.get(url, headers=headers, timeout=self._timeout_s, follow_redirects=True)
+        except (httpx.TimeoutException, httpx.ConnectError) as e:
+            raise SourceUnavailableError(f"config.json fetch failed: {e}") from e
+        if resp.status_code == 404:
+            raise ModelNotFoundError(
+                f"Model '{model_id}' exists but has no config.json. "
+                "May be a GGUF-only or dataset repo (not supported in v0.1)."
+            )
+        if resp.status_code in (401, 403):
+            raise AuthRequiredError(hf_auth_error_message(model_id))
+        if resp.status_code == 429:
+            retry = resp.headers.get("Retry-After", "unknown")
+            raise SourceUnavailableError(f"HuggingFace rate limit (429). Retry-After: {retry}s.")
+        if resp.status_code >= 400:
+            raise SourceUnavailableError(f"config.json fetch returned HTTP {resp.status_code}")
+        try:
+            parsed: dict[str, Any] = json.loads(resp.text)
+        except json.JSONDecodeError as e:
+            raise SourceUnavailableError(
+                f"config.json is not valid JSON (line {e.lineno} col {e.colno}): {e.msg}"
+            ) from e
+        return parsed

src/llm_cal/model_source/modelscope.py ADDED Viewed

	@@ -0,0 +1,229 @@

+"""ModelScope source — REST-only via httpx.
+Decision: Option B from ADR-001. We don't need the official `modelscope` SDK
+because llm-cal only requires three things:
+  1. List repo files + sizes (one API call)
+  2. Fetch config.json (one API call)
+  3. Range-GET a safetensors header (handled by safetensors_reader)
+The SDK pulls heavy ML deps by default (torch / tf for some install paths).
+REST keeps the install footprint flat, mirrors the existing httpx hot path,
+and gives us identical exception semantics across HF + MS.
+Endpoints (verified against modelscope.cn public docs, 2026-04):
+  * GET /api/v1/models/{owner}/{name}                       — model meta
+  * GET /api/v1/models/{owner}/{name}/repo/files?Recursive=true
+                                                            — file tree + sizes
+  * GET /api/v1/models/{owner}/{name}/repo?FilePath=...&Revision=...
+                                                            — raw file content
+ModelScope wraps every response in a {Code, Message, Data, Success} envelope.
+Field casing is PascalCase. We parse defensively — fields may evolve.
+"""
+from __future__ import annotations
+import json
+from typing import Any
+import httpx
+from llm_cal.model_source.auth import (
+    get_modelscope_token,
+    modelscope_auth_error_message,
+)
+from llm_cal.model_source.base import (
+    AuthRequiredError,
+    ModelArtifact,
+    ModelNotFoundError,
+    ModelSource,
+    SiblingFile,
+    SourceUnavailableError,
+)
+DEFAULT_ENDPOINT = "https://www.modelscope.cn"
+DEFAULT_REVISION = "master"
+_INFO_PATH = "/api/v1/models/{model_id}"
+_FILES_PATH = "/api/v1/models/{model_id}/repo/files"
+_RAW_PATH = "/api/v1/models/{model_id}/repo"
+class ModelScopeSource(ModelSource):
+    name = "modelscope"
+    def __init__(
+        self,
+        endpoint: str | None = None,
+        timeout_s: float = 30.0,
+        revision: str = DEFAULT_REVISION,
+    ) -> None:
+        self._endpoint = (endpoint or DEFAULT_ENDPOINT).rstrip("/")
+        self._timeout_s = timeout_s
+        self._revision = revision
+    def fetch(self, model_id: str) -> ModelArtifact:
+        token = get_modelscope_token()
+        headers = self._auth_headers(token)
+        # Step 1: model info — gives us LatestSha (commit pin) when available.
+        # We tolerate missing info; fall back to revision="master" so that the
+        # file list + config calls still work.
+        commit_sha = self._fetch_commit_sha(model_id, headers)
+        # Step 2: file tree with sizes. ONE call, recursive, includes sub-folders.
+        siblings = self._list_files(model_id, commit_sha or self._revision, headers)
+        # Step 3: config.json. Pin to the commit sha when we have it so two
+        # back-to-back calls don't race against a repo update.
+        config = self._fetch_config(model_id, commit_sha or self._revision, headers)
+        return ModelArtifact(
+            source=self.name,
+            model_id=model_id,
+            commit_sha=commit_sha,
+            config=config,
+            siblings=siblings,
+        )
+    # ------------------------------------------------------------------ helpers
+    def _auth_headers(self, token: str | None) -> dict[str, str]:
+        return {"Authorization": f"Bearer {token}"} if token else {}
+    def _fetch_commit_sha(self, model_id: str, headers: dict[str, str]) -> str | None:
+        url = f"{self._endpoint}{_INFO_PATH.format(model_id=model_id)}"
+        try:
+            resp = httpx.get(
+                url, headers=headers, timeout=self._timeout_s, follow_redirects=True
+            )
+        except (httpx.TimeoutException, httpx.ConnectError, httpx.HTTPError):
+            # Soft fail — commit sha is best-effort. Caller will use "master".
+            return None
+        if resp.status_code != 200:
+            return None
+        try:
+            payload = resp.json()
+        except json.JSONDecodeError:
+            return None
+        data = payload.get("Data") if isinstance(payload, dict) else None
+        if not isinstance(data, dict):
+            return None
+        # Field name has bounced between LatestSha / latest_sha / Revision in
+        # historical docs; check several.
+        for key in ("LatestSha", "latest_sha", "Revision", "Sha"):
+            v = data.get(key)
+            if isinstance(v, str) and v:
+                return v
+        return None
+    def _list_files(
+        self, model_id: str, revision: str, headers: dict[str, str]
+    ) -> tuple[SiblingFile, ...]:
+        url = f"{self._endpoint}{_FILES_PATH.format(model_id=model_id)}"
+        params = {"Recursive": "true", "Revision": revision}
+        try:
+            resp = httpx.get(
+                url,
+                headers=headers,
+                params=params,
+                timeout=self._timeout_s,
+                follow_redirects=True,
+            )
+        except (httpx.TimeoutException, httpx.ConnectError) as e:
+            raise SourceUnavailableError(f"ModelScope file list failed: {e}") from e
+        self._raise_for_status(resp, model_id, what="file list")
+        try:
+            payload = resp.json()
+        except json.JSONDecodeError as e:
+            raise SourceUnavailableError(
+                f"ModelScope file list returned non-JSON: {e}"
+            ) from e
+        files = _extract_files(payload)
+        if files is None:
+            raise SourceUnavailableError(
+                "ModelScope file list payload had unexpected shape — "
+                "neither Data.Files nor Data is a list."
+            )
+        return tuple(
+            SiblingFile(filename=f["Path"], size=f.get("Size"))
+            for f in files
+            if isinstance(f, dict) and isinstance(f.get("Path"), str)
+            # Only include blobs (not directories). Type=tree means folder.
+            and f.get("Type", "blob") != "tree"
+        )
+    def _fetch_config(
+        self, model_id: str, revision: str, headers: dict[str, str]
+    ) -> dict[str, Any]:
+        url = f"{self._endpoint}{_RAW_PATH.format(model_id=model_id)}"
+        params = {"FilePath": "config.json", "Revision": revision}
+        try:
+            resp = httpx.get(
+                url,
+                headers=headers,
+                params=params,
+                timeout=self._timeout_s,
+                follow_redirects=True,
+            )
+        except (httpx.TimeoutException, httpx.ConnectError) as e:
+            raise SourceUnavailableError(f"config.json fetch failed: {e}") from e
+        self._raise_for_status(resp, model_id, what="config.json")
+        try:
+            parsed: Any = json.loads(resp.text)
+        except json.JSONDecodeError as e:
+            raise SourceUnavailableError(
+                f"config.json is not valid JSON (line {e.lineno} col {e.colno}): {e.msg}"
+            ) from e
+        if not isinstance(parsed, dict):
+            raise SourceUnavailableError(
+                "config.json did not parse to a JSON object."
+            )
+        return parsed
+    def _raise_for_status(
+        self, resp: httpx.Response, model_id: str, what: str
+    ) -> None:
+        if resp.status_code == 200:
+            return
+        if resp.status_code == 404:
+            raise ModelNotFoundError(
+                f"Model '{model_id}' not found on ModelScope ({what})."
+            )
+        if resp.status_code in (401, 403):
+            raise AuthRequiredError(modelscope_auth_error_message(model_id))
+        if resp.status_code == 429:
+            retry = resp.headers.get("Retry-After", "unknown")
+            raise SourceUnavailableError(
+                f"ModelScope rate limit (429). Retry-After: {retry}s. "
+                "Setting MODELSCOPE_API_TOKEN increases your quota."
+            )
+        raise SourceUnavailableError(
+            f"ModelScope {what} returned HTTP {resp.status_code}"
+        )
+def _extract_files(payload: Any) -> list[Any] | None:
+    """Pull the file list out of the wrapped ModelScope envelope.
+    Tolerates two known shapes:
+      A) {Data: {Files: [...]}}      — most common
+      B) {Data: [...]}               — older / list-only endpoints
+    """
+    if not isinstance(payload, dict):
+        return None
+    data = payload.get("Data")
+    if isinstance(data, dict):
+        files = data.get("Files")
+        if isinstance(files, list):
+            return files
+    if isinstance(data, list):
+        return data
+    return None

src/llm_cal/output/__init__.py ADDED Viewed

File without changes

src/llm_cal/output/formatter.py ADDED Viewed

	@@ -0,0 +1,665 @@

+"""Rich-formatted, fully i18n'd output for EvaluationReport.
+Every visible string flows through `common.i18n.t()`. To add another locale,
+add entries to `_MESSAGES` in i18n.py; no changes here needed.
+"""
+from __future__ import annotations
+from typing import Any
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+from rich.text import Text
+from llm_cal.common.i18n import get_locale, t
+from llm_cal.core.evaluator import EvaluationReport
+from llm_cal.engine_compat.loader import EngineCompatEntry, EngineFlag, EngineSource
+from llm_cal.fleet.planner import FleetRecommendation
+from llm_cal.hardware.loader import GPUDatabase
+from llm_cal.output.labels import AnnotatedValue, Label
+# Rich style string used when rendering each provenance label's tag.
+_LABEL_STYLES: dict[Label, str] = {
+    Label.VERIFIED: "bold green",
+    Label.INFERRED: "cyan",
+    Label.ESTIMATED: "yellow",
+    Label.CITED: "blue",
+    Label.UNVERIFIED: "bold yellow",
+    Label.UNKNOWN: "dim red",
+    Label.LLM_OPINION: "magenta",
+}
+def format_tag(av: AnnotatedValue[Any]) -> Text:
+    style = _LABEL_STYLES.get(av.label, "white")
+    display = t(f"label.{av.label.value}")  # localized; falls back to English
+    return Text(f"[{display}]", style=style)
+def _fmt_bytes(n: int) -> str:
+    if n >= 1_000_000_000:
+        return f"{n / 1_000_000_000:.2f} GB"
+    if n >= 1_000_000:
+        return f"{n / 1_000_000:.2f} MB"
+    if n >= 1_000:
+        return f"{n / 1_000:.2f} KB"
+    return f"{n} B"
+def _fmt_params(n: int) -> str:
+    if n >= 1_000_000_000:
+        return f"{n / 1_000_000_000:.2f}B"
+    if n >= 1_000_000:
+        return f"{n / 1_000_000:.2f}M"
+    return str(n)
+def render(report: EvaluationReport, console: Console | None = None) -> None:
+    console = console or Console()
+    console.print()
+    sha_frag = f" @ {report.commit_sha[:7]}" if report.commit_sha else ""
+    console.print(
+        Panel.fit(
+            f"[bold cyan]{report.model_id}[/bold cyan]  "
+            f"[dim]{t('panel.via')} {report.source}{sha_frag}[/dim]",
+            border_style="cyan",
+        )
+    )
+    _render_architecture(report, console)
+    _render_weight(report, console)
+    _render_kv_cache(report, console)
+    _render_engine_compat(report, console)
+    _render_hardware(report, console)
+    _render_fleet(report, console)
+    _render_performance(report, console)
+    _render_command(report, console)
+    _render_label_legend(console)
+def _render_architecture(report: EvaluationReport, console: Console) -> None:
+    """Print the architecture section: a three-column (field, value, label) table.
+    Rows for attention, compression ratios, MoE, sliding window and max
+    position appear only when the profile carries that information. Any
+    warnings found in `profile.auxiliary` are printed after the table.
+    """
+    p = report.profile
+    table = Table(title=t("section.architecture"), show_header=False, box=None, padding=(0, 2))
+    table.add_column("field", style="dim")
+    table.add_column("value")
+    table.add_column("label")
+    # Rows that are always present.
+    table.add_row(t("arch.model_type"), p.model_type or t("arch.none"), _verified_tag())
+    table.add_row(t("arch.family"), p.family.value, _verified_tag())
+    # The confidence row shows the confidence value itself as its tag.
+    table.add_row(
+        t("arch.confidence"), p.confidence.value, Text(f"[{p.confidence.value}]", style="magenta")
+    )
+    table.add_row(t("arch.layers"), str(p.num_hidden_layers), _verified_tag())
+    table.add_row(t("arch.hidden_size"), str(p.hidden_size), _verified_tag())
+    table.add_row(t("arch.vocab_size"), f"{p.vocab_size:,}", _verified_tag())
+    if p.attention is not None:
+        table.add_row(
+            t("arch.attention"),
+            t(
+                "arch.attn_summary",
+                variant=p.attention.variant,
+                heads=p.attention.num_heads,
+                kv_heads=p.attention.num_kv_heads,
+                head_dim=p.attention.head_dim,
+            ),
+            _verified_tag(),
+        )
+        if p.attention.compress_ratios:
+            ratios = p.attention.compress_ratios
+            table.add_row(
+                t("arch.compress_ratios"),
+                t(
+                    "arch.compress_ratios_summary",
+                    n=len(ratios),
+                    # Entries equal to 0 are counted and reported as `dense`.
+                    dense=sum(1 for r in ratios if r == 0),
+                ),
+                _verified_tag(),
+            )
+    if p.moe is not None:
+        table.add_row(
+            t("arch.moe"),
+            t(
+                "arch.moe_summary",
+                routed=p.moe.num_routed_experts,
+                shared=p.moe.num_shared_experts,
+                topk=p.moe.num_experts_per_tok,
+            ),
+            _verified_tag(),
+        )
+    if p.sliding_window:
+        table.add_row(t("arch.sliding_window"), str(p.sliding_window), _verified_tag())
+    if p.position and p.position.max_position_embeddings:
+        table.add_row(
+            t("arch.max_position"),
+            f"{p.position.max_position_embeddings:,}",
+            _verified_tag(),
+        )
+    console.print(table)
+    # Warnings go below the table: a free-form one in red, then the
+    # "v0_1_unsupported" notice in yellow.
+    if p.auxiliary.get("warning"):
+        console.print(f"[red]⚠ {p.auxiliary['warning']}[/red]")
+    if p.auxiliary.get("v0_1_unsupported"):
+        console.print(f"[yellow]⚠ {t('arch.unsupported_state_space')}[/yellow]")
+def _render_weight(report: EvaluationReport, console: Console) -> None:
+    table = Table(title=t("section.weights"), show_header=False, box=None, padding=(0, 2))
+    table.add_column("field", style="dim")
+    table.add_column("value")
+    table.add_column("label")
+    w = report.weight
+    table.add_row(
+        t("weights.safetensors_bytes"),
+        _fmt_bytes(w.total_bytes.value),
+        format_tag(w.total_bytes),
+    )
+    table.add_row(
+        t("weights.params_estimated"),
+        _fmt_params(report.total_params_estimate.value),
+        format_tag(report.total_params_estimate),
+    )
+    if w.bits_per_param is not None:
+        table.add_row(
+            t("weights.bits_per_param"),
+            f"{w.bits_per_param.value:.2f}",
+            format_tag(w.bits_per_param),
+        )
+    table.add_row(
+        t("weights.quant_guess"),
+        str(w.quantization_guess.value),
+        format_tag(w.quantization_guess),
+    )
+    console.print(table)
+    r = report.reconciliation
+    if r.candidates:
+        rec_table = Table(
+            title=t("section.reconciliation"),
+            title_justify="left",
+            show_header=True,
+            header_style="dim",
+            box=None,
+            padding=(0, 2),
+        )
+        rec_table.add_column(t("recon.scheme"))
+        rec_table.add_column(t("recon.predicted"), justify="right")
+        rec_table.add_column(t("recon.delta"), justify="right")
+        rec_table.add_column(t("recon.error_pct"), justify="right")
+        for c in r.candidates[:6]:
+            direction = t("recon.over") if c.delta_bytes > 0 else t("recon.under")
+            rec_table.add_row(
+                c.scheme,
+                _fmt_bytes(c.predicted_bytes),
+                f"{_fmt_bytes(abs(c.delta_bytes))} {direction}",
+                f"{c.relative_error * 100:.1f}%",
+            )
+        console.print(rec_table)
+        console.print(f"[bold]{t('recon.best')}[/bold] {r.best.value}  {format_tag(r.best)}")
+def _render_kv_cache(report: EvaluationReport, console: Console) -> None:
+    if not report.kv_cache_by_context:
+        return
+    table = Table(
+        title=t("section.kv_cache"),
+        title_justify="left",
+        show_header=True,
+        header_style="dim",
+        box=None,
+        padding=(0, 2),
+    )
+    table.add_column(t("kv.context"))
+    table.add_column(t("kv.kv_cache"), justify="right")
+    table.add_column(t("kv.label"))
+    tokens_word = t("kv.tokens")
+    for ctx, av in report.kv_cache_by_context.items():
+        table.add_row(
+            f"{ctx:,} {tokens_word}",
+            _fmt_bytes(av.value),
+            format_tag(av),
+        )
+    console.print(table)
+def _render_engine_compat(report: EvaluationReport, console: Console) -> None:
+    m = report.engine_match
+    if m is None:
+        console.print()
+        console.print(
+            f"[dim]{t('section.engine_compat')}:[/dim] [yellow]{t('engine.no_match')}[/yellow]"
+        )
+        return
+    table = Table(
+        title=f"{t('section.engine_compat')} — {m.engine}",
+        show_header=False,
+        box=None,
+        padding=(0, 2),
+    )
+    table.add_column("field", style="dim")
+    table.add_column("value")
+    table.add_column("label")
+    verif_label = _verif_label(m)
+    table.add_row(t("engine.version_spec"), m.version_spec, Text(""))
+    table.add_row(t("engine.support"), m.support, verif_label)
+    table.add_row(t("engine.verification"), m.verification_level, verif_label)
+    if m.required_flags:
+        lines = [_fmt_flag(f) for f in m.required_flags]
+        table.add_row(t("engine.required_flags"), "\n".join(lines), Text(""))
+    if m.optional_flags:
+        lines = [_fmt_flag(f) for f in m.optional_flags]
+        table.add_row(t("engine.optional_flags"), "\n".join(lines), Text(""))
+    caveats = m.caveats_zh if get_locale() == "zh" else m.caveats_en
+    if caveats:
+        table.add_row(t("engine.caveats"), "\n".join(f"• {c}" for c in caveats), Text(""))
+    if m.sources:
+        source_lines = [_fmt_source(s) for s in m.sources]
+        table.add_row(t("engine.sources"), "\n".join(source_lines), Text(""))
+    console.print(table)
+def _render_hardware(report: EvaluationReport, console: Console) -> None:
+    console.print()
+    if report.gpu_spec is None:
+        msg = report.gpu_error or f"Unknown GPU '{report.gpu}'"
+        console.print(f"[bold red]{t('section.hardware')}:[/bold red] [red]{msg}[/red]")
+        return
+    spec = report.gpu_spec
+    locale = get_locale()
+    table = Table(
+        title=f"{t('section.hardware')} — {spec.id}",
+        show_header=False,
+        box=None,
+        padding=(0, 2),
+    )
+    table.add_column("field", style="dim")
+    table.add_column("value")
+    table.add_row(t("hw.memory"), f"{spec.memory_gb} GB HBM")
+    table.add_row(t("hw.nvlink_bandwidth"), f"{spec.nvlink_bandwidth_gbps} GB/s")
+    table.add_row(t("hw.fp16_tflops"), f"{spec.fp16_tflops:.0f} TFLOPS")
+    table.add_row(t("hw.fp8_support"), t("hw.bool_yes") if spec.fp8_support else t("hw.bool_no"))
+    table.add_row(t("hw.fp4_support"), t("hw.bool_yes") if spec.fp4_support else t("hw.bool_no"))
+    notes = spec.localized_notes(locale)
+    if notes:
+        table.add_row(t("hw.notes"), notes)
+    if spec.spec_source:
+        table.add_row(t("hw.spec_source"), spec.spec_source)
+    console.print(table)
+def _render_fleet(report: EvaluationReport, console: Console) -> None:
+    f = report.fleet
+    if f is None:
+        if report.gpu_spec is None:
+            return  # hardware section already surfaced the error
+        console.print(f"[dim]{t('fleet.gpu_spec_unknown')}[/dim]")
+        return
+    # Decide which context lengths to surface as concurrency columns.
+    ctx_cols = _select_concurrency_columns(f)
+    table = Table(
+        title=f"{t('section.fleet')} — {report.gpu_spec.id if report.gpu_spec else report.gpu}",
+        title_justify="left",
+        show_header=True,
+        header_style="dim",
+        box=None,
+        padding=(0, 2),
+    )
+    table.add_column(t("fleet.col.tier"))
+    table.add_column(t("fleet.col.gpus"), justify="right")
+    table.add_column(t("fleet.col.weight_per_gpu"), justify="right")
+    table.add_column(t("fleet.col.headroom_per_gpu"), justify="right")
+    for ctx in ctx_cols:
+        table.add_column(
+            t("fleet.col.concurrent_at_ctx", ctx=_fmt_ctx(ctx)),
+            justify="right",
+        )
+    for opt in f.options:
+        headroom = opt.usable_bytes_per_gpu - opt.weight_bytes_per_gpu
+        label_tier = t(f"fleet.tier.{opt.tier}")
+        marker = " ★" if opt.tier == f.best_tier else ""
+        row_style = None if opt.fits else "dim red"
+        conc_map = dict(opt.max_concurrent_by_context)
+        row = [
+            f"{label_tier}{marker}",
+            str(opt.gpu_count),
+            _fmt_bytes(opt.weight_bytes_per_gpu),
+            _fmt_bytes(headroom) if headroom > 0 else "—",
+        ]
+        for ctx in ctx_cols:
+            n = conc_map.get(ctx, 0)
+            row.append(f"~{n}" if n > 0 else "✗")
+        table.add_row(*row, style=row_style)
+    console.print(table)
+    locale = get_locale()
+    note = f.constraint_note_zh if locale == "zh" else f.constraint_note_en
+    console.print(f"[dim]{t('fleet.constraint')} {note}[/dim]")
+    console.print(f"[dim]★ {t('fleet.best_marker')}[/dim]")
+def _select_concurrency_columns(f: FleetRecommendation) -> list[int]:
+    """Pick which context lengths become concurrency columns in the fleet table.
+    Rule: always include 128K if the model supports it; additionally include the
+    model's max context if it's larger than 128K. For shorter-context models,
+    fall back to 32K or whatever the max is.
+    """
+    all_ctxs: set[int] = set()
+    for opt in f.options:
+        for ctx, _ in opt.max_concurrent_by_context:
+            all_ctxs.add(ctx)
+    if not all_ctxs:
+        return []
+    picks: list[int] = []
+    if 131_072 in all_ctxs:
+        picks.append(131_072)
+    max_ctx = max(all_ctxs)
+    if max_ctx > 131_072 and max_ctx not in picks:
+        picks.append(max_ctx)
+    if not picks:
+        picks.append(32_768 if 32_768 in all_ctxs else max_ctx)
+    return picks
+def _fmt_ctx(ctx_tokens: int) -> str:
+    if ctx_tokens >= 1_000_000:
+        if ctx_tokens % 1_000_000 == 0:
+            return f"{ctx_tokens // 1_000_000}M"
+        return f"{ctx_tokens / 1_000_000:.1f}M"
+    if ctx_tokens >= 1024:
+        return f"{ctx_tokens // 1024}K"
+    return str(ctx_tokens)
+def _render_performance(report: EvaluationReport, console: Console) -> None:
+    if (
+        report.prefill is None
+        or report.decode is None
+        or report.concurrency is None
+        or report.perf_input_tokens is None
+        or report.perf_target_tokens_per_sec is None
+    ):
+        return
+    console.print()
+    # Assumption banner — surfaces the utilization factors, SLA, and
+    # degradation factor. Every number in the performance section depends
+    # on these.
+    assumptions = t(
+        "perf.assumptions_note",
+        input_tokens=report.perf_input_tokens,
+        output_tokens=report.perf_output_tokens,
+        target_tps=report.perf_target_tokens_per_sec,
+        prefill_util=report.prefill.utilization,
+        decode_util=report.decode.bw_utilization,
+        degradation=report.concurrency.degradation_factor,
+    )
+    console.print(f"[dim italic]{assumptions}[/dim italic]")
+    table = Table(
+        title=t("section.performance"),
+        title_justify="left",
+        show_header=False,
+        box=None,
+        padding=(0, 2),
+    )
+    table.add_column("field", style="dim")
+    table.add_column("value")
+    table.add_column("label")
+    p = report.prefill
+    d = report.decode
+    c = report.concurrency
+    table.add_row(
+        t("perf.prefill_latency"),
+        f"{p.latency_ms.value:.1f} ms",
+        format_tag(p.latency_ms),
+    )
+    table.add_row(
+        t("perf.decode_throughput_per_gpu"),
+        f"{d.per_gpu_tokens_per_sec.value:.1f} tok/s",
+        format_tag(d.per_gpu_tokens_per_sec),
+    )
+    table.add_row(
+        t("perf.decode_throughput_cluster"),
+        f"{d.cluster_tokens_per_sec.value:.1f} tok/s",
+        format_tag(d.cluster_tokens_per_sec),
+    )
+    if d.moe_active_tokens_per_sec is not None:
+        table.add_row(
+            t("perf.decode_moe_active_optimistic"),
+            f"{d.moe_active_tokens_per_sec.value:.1f} tok/s",
+            format_tag(d.moe_active_tokens_per_sec),
+        )
+    table.add_row(
+        t("perf.k_bound"),
+        str(c.k_bound.value),
+        format_tag(c.k_bound),
+    )
+    table.add_row(
+        t("perf.l_bound"),
+        str(c.l_bound.value),
+        format_tag(c.l_bound),
+    )
+    table.add_row(
+        t("perf.max_concurrent"),
+        str(c.max_concurrent.value),
+        format_tag(c.max_concurrent),
+    )
+    bottleneck_label = t(f"perf.bottleneck.{c.bottleneck}")
+    locale = get_locale()
+    reason = c.bottleneck_reason_zh if locale == "zh" else c.bottleneck_reason_en
+    table.add_row(
+        t("perf.bottleneck"),
+        f"{bottleneck_label} — {reason}",
+        Text(""),
+    )
+    console.print(table)
+    # Always show a short optimization list. Rules are currently static but
+    # future versions can pick per bottleneck type.
+    console.print(f"[bold]{t('perf.optimization.header')}:[/bold]")
+    for key in (
+        "perf.opt.quantize_int4",
+        "perf.opt.relax_sla",
+        "perf.opt.kv_fp8",
+        "perf.opt.moe_offload",
+    ):
+        console.print(f"  • {t(key)}")
+def _render_command(report: EvaluationReport, console: Console) -> None:
+    if not report.generated_command or report.fleet is None:
+        return
+    # Figure out which tier we emitted the command for.
+    best_tier_opt = next(
+        (o for o in report.fleet.options if o.tier == report.fleet.best_tier),
+        report.fleet.options[0],
+    )
+    tier_label = t(f"fleet.tier.{best_tier_opt.tier}")
+    header_note = t("command.tier_note", tier=tier_label, gpus=best_tier_opt.gpu_count)
+    console.print()
+    console.print(
+        Panel(
+            report.generated_command,
+            title=f"{t('section.command')} — {header_note}",
+            title_align="left",
+            border_style="green",
+        )
+    )
+def _render_label_legend(console: Console) -> None:
+    legend = Text()
+    legend.append(f"{t('section.labels')} ", style="dim")
+    for label in Label:
+        display = t(f"label.{label.value}")
+        legend.append(f"[{display}] ", style=_LABEL_STYLES.get(label, "white"))
+    console.print(legend)
+def _verified_tag() -> Text:
+    return Text(f"[{t('label.verified')}]", style=_LABEL_STYLES[Label.VERIFIED])
+def render_llm_review(result: Any, console: Console | None = None) -> None:
+    """Render --llm-review block. Accepts an LLMReviewResult.
+    Failure is non-fatal — shows setup hint and continues.
+    """
+    console = console or Console()
+    console.print()
+    console.print(Panel.fit(t("section.llm_review"), border_style="magenta"))
+    if not result.ok:
+        msg = t("llm_review.unavailable", error=result.error or "unknown")
+        console.print(f"[yellow]{msg}[/yellow]")
+        console.print(f"[dim]{t('llm_review.setup_hint')}[/dim]")
+        return
+    # Disclaimer first — make it visually distinctive so users don't confuse
+    # LLM opinion with the tool's own output.
+    disclaimer = t("llm_review.disclaimer", model=result.model, base_url=result.base_url)
+    console.print(f"[bold yellow]{disclaimer}[/bold yellow]")
+    console.print()
+    # The actual review, prefixed with the [llm-opinion] tag so users see
+    # it's tagged too.
+    tag_style = _LABEL_STYLES[Label.LLM_OPINION]
+    tag_display = t(f"label.{Label.LLM_OPINION.value}")
+    console.print(f"[{tag_style}][{tag_display}][/{tag_style}]")
+    # Print content verbatim (LLM output is markdown-ish; let it through).
+    console.print(result.content or "")
+def render_explain(entries: list[Any], console: Console | None = None) -> None:
+    """Render `--explain` block: full derivation trace for each number.
+    `entries` is a list of `core.explain.ExplainEntry`.
+    """
+    console = console or Console()
+    console.print()
+    console.print(Panel.fit(t("section.explain"), border_style="magenta"))
+    console.print(f"[dim italic]{t('explain.intro')}[/dim italic]")
+    console.print()
+    for entry in entries:
+        # Title bar per entry
+        console.print(Panel.fit(f"[bold]{entry.heading}[/bold]", border_style="cyan"))
+        # Formula (monospace)
+        console.print(f"[bold]{t('explain.formula')}:[/bold]")
+        for line in entry.formula.splitlines():
+            console.print(f"  [magenta]{line}[/magenta]")
+        # Inputs
+        if entry.inputs:
+            console.print(f"[bold]{t('explain.inputs')}:[/bold]")
+            for inp in entry.inputs:
+                note = f" [dim]({inp.note})[/dim]" if inp.note else ""
+                console.print(
+                    f"  [cyan]{inp.name}[/cyan] = {inp.value}  [dim]{inp.label}[/dim]{note}"
+                )
+        # Steps
+        if entry.steps:
+            console.print(f"[bold]{t('explain.steps')}:[/bold]")
+            for step in entry.steps:
+                for line in step.splitlines():
+                    console.print(f"  {line}")
+        # Result
+        console.print(f"[bold]{t('explain.result')}:[/bold]  {entry.result}")
+        # Source + methodology anchor
+        if entry.source:
+            console.print(f"[bold]{t('explain.source')}:[/bold]  {entry.source}")
+        if entry.methodology_anchor:
+            console.print(
+                f"[dim]{t('explain.see_also')}: docs/methodology.md{entry.methodology_anchor}[/dim]"
+            )
+        console.print()
+def render_gpu_list(db: GPUDatabase, console: Console | None = None) -> None:
+    """Print the supported-GPU table. Invoked by `llm-cal --list-gpus`."""
+    console = console or Console()
+    locale = get_locale()
+    table = Table(
+        title=t("gpus.list.title"),
+        title_justify="left",
+        show_header=True,
+        header_style="dim",
+        box=None,
+        padding=(0, 2),
+    )
+    table.add_column(t("gpus.col.id"))
+    table.add_column(t("gpus.col.memory"), justify="right")
+    table.add_column(t("gpus.col.nvlink"), justify="right")
+    table.add_column(t("gpus.col.fp16"), justify="right")
+    table.add_column(t("gpus.col.fp8"), justify="center")
+    table.add_column(t("gpus.col.fp4"), justify="center")
+    table.add_column(t("gpus.col.aliases"))
+    yes = t("hw.bool_yes")
+    no = t("hw.bool_no")
+    # Preserve YAML insertion order (vendors are grouped there).
+    for spec in db.gpus:
+        aliases_str = ", ".join(spec.aliases) if spec.aliases else "—"
+        nvlink_str = f"{spec.nvlink_bandwidth_gbps} GB/s" if spec.nvlink_bandwidth_gbps else "—"
+        table.add_row(
+            spec.id,
+            f"{spec.memory_gb} GB",
+            nvlink_str,
+            f"{spec.fp16_tflops:.0f}",
+            yes if spec.fp8_support else no,
+            yes if spec.fp4_support else no,
+            aliases_str,
+        )
+    console.print(table)
+    console.print(f"[dim]{t('gpus.total', count=len(db.gpus))}[/dim]")
+    _ = locale  # suppress unused var warn until we add locale-dependent notes column
+def _verif_label(entry: EngineCompatEntry) -> Text:
+    """Engine compat rows use the same label vocabulary as AnnotatedValue."""
+    label = {
+        "verified": Label.VERIFIED,
+        "cited": Label.CITED,
+        "unverified": Label.UNVERIFIED,
+    }.get(entry.verification_level, Label.UNKNOWN)
+    return Text(f"[{t(f'label.{label.value}')}]", style=_LABEL_STYLES.get(label, "white"))
+def _fmt_flag(f: EngineFlag) -> str:
+    if f.value is None:
+        return f.flag
+    return f"{f.flag} {f.value}"
+def _fmt_source(s: EngineSource) -> str:
+    label = t(f"source.{s.type}")
+    if s.type == "tested":
+        return f"[{label}] {s.tester} @ {s.hardware} ({s.date})"
+    if s.url:
+        captured = f" ({t('source.captured_on')} {s.captured_date})" if s.captured_date else ""
+        return f"[{label}] {s.url}{captured}"
+    return f"[{label}]"

src/llm_cal/output/labels.py ADDED Viewed

	@@ -0,0 +1,46 @@

+"""6-level label discipline — the soul of the tool.
+Every number in the output must be wrapped in `AnnotatedValue` so users always know
+where a value came from. Using `StrEnum` (not bare strings) means typos are caught by
+mypy/ruff, not by users.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from enum import StrEnum
+from typing import Generic, TypeVar
+class Label(StrEnum):
+    """Provenance labels attached to reported values.
+    Each member's string value is the text that ``AnnotatedValue.render_tag``
+    prints between square brackets.
+    """
+    VERIFIED = "verified"
+    INFERRED = "inferred"
+    ESTIMATED = "estimated"
+    CITED = "cited"
+    UNVERIFIED = "unverified"
+    UNKNOWN = "unknown"
+    # Experimental opt-in 7th level. Populated only when --llm-review is used.
+    # Never overrides the first 6 — it's an external second opinion, not truth.
+    LLM_OPINION = "llm-opinion"
+T = TypeVar("T")
+@dataclass(frozen=True)
+class AnnotatedValue(Generic[T]):
+    """A value paired with provenance metadata.
+    Examples:
+        AnnotatedValue(160_300_000_000, Label.VERIFIED, source="HF model_info.siblings")
+        AnnotatedValue(4.52, Label.INFERRED, source="160.3 GB / 284B params")
+        AnnotatedValue(2_600_000_000, Label.ESTIMATED,
+                       source="compress_ratios=[0,0,4,128,...] at 128K ctx")
+    """
+    value: T
+    label: Label
+    source: str | None = None
+    def render_tag(self) -> str:
+        return f"[{self.label.value}]"

src/llm_cal/performance/__init__.py ADDED Viewed

File without changes

src/llm_cal/performance/compute.py ADDED Viewed

	@@ -0,0 +1,233 @@

+"""Performance modeling for prefill latency and decode throughput.
+FORMULAS — with sources. See docs/methodology.md for the full audit.
+Prefill (compute-bound):
+    FLOPs = 2 × params × input_tokens
+    latency = FLOPs / (peak_TFLOPS × num_gpus × utilization × 1e12)
+    Source: Kaplan et al. 2020, "Scaling Laws for Neural Language Models".
+    The "2" factor is the forward-pass cost per param per token, a standard
+    approximation in transformer inference literature.
+Decode (memory-bandwidth-bound):
+    per_token_time = weight_bytes_per_gpu / (memory_bandwidth × utilization)
+    tokens_per_second = memory_bandwidth × utilization / weight_bytes_per_gpu
+    Source: Kwon et al. SOSP 2023 "Efficient Memory Management for Large
+    Language Model Serving with PagedAttention"; NVIDIA "Mastering LLM
+    Techniques: Inference Optimization" (2023 technical blog).
+UTILIZATION FACTORS (all empirical, ALL user-overridable):
+  - Prefill 40% — midpoint of vLLM-reported 30-50% MFU on H100
+  - Decode BW 50% — midpoint of NVIDIA/vLLM-reported 40-65% achieved bandwidth
+  - Cluster comm 90% — typical NCCL AllReduce efficiency at TP=8 on NVLink
+  - Concurrency degradation 1.0 (no degradation by default)
+    This is the most uncertain factor. Prior versions defaulted to 1.5
+    (borrowed from an LLM-generated report), which was NOT from a primary
+    source. v0.1 defaults to 1.0 (honest baseline) and exposes the knob
+    so users can dial in whatever their engine actually achieves.
+MoE "active" vs "total":
+    Strictly, MoE decode only reads the active experts per token. The
+    ratio used here is a rough approximation:
+        active_ratio ≈ (experts_per_tok + shared_experts) / (routed + shared)
+    This UNDERESTIMATES active weight because attention + embeddings are
+    always active (not just experts). For a more accurate number, use the
+    model card's stated "total / active" figure if available. The
+    "active-only" throughput is labeled "optimistic" for this reason.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from llm_cal.architecture.profile import ArchitectureProfile
+from llm_cal.hardware.loader import GPUSpec
+from llm_cal.output.labels import AnnotatedValue, Label
+# Empirical defaults. All user-overridable via CLI.
+# Fraction of aggregate peak FP16 TFLOPS assumed to be achieved during prefill.
+DEFAULT_PREFILL_UTILIZATION = 0.40
+# Fraction of GPU memory bandwidth assumed to be achieved during decode.
+DEFAULT_DECODE_BW_UTILIZATION = 0.50
+# Multiplier applied to aggregate multi-GPU decode throughput for communication overhead.
+DEFAULT_CLUSTER_COMM_EFFICIENCY = 0.90
+# Honest baseline. Previously 1.5, borrowed from an LLM-generated report —
+# that had no primary source, so we reset to 1.0. Users who observe actual
+# degradation on their engine should dial this up via CLI.
+DEFAULT_CONCURRENCY_DEGRADATION = 1.0
+@dataclass(frozen=True)
+class PrefillEstimate:
+    """Immutable result of ``estimate_prefill``.
+    When the inputs are insufficient, all three annotated fields carry a
+    zero value labeled ``Label.UNKNOWN``.
+    """
+    total_flops: AnnotatedValue[int]  # [estimated] 2 * params * input_tokens
+    # per-card FP16 TFLOPS × num_gpus × utilization
+    peak_effective_tflops: AnnotatedValue[float]  # TFLOPS × utilization
+    # total_flops / (peak_effective_tflops × 1e12), converted to milliseconds
+    latency_ms: AnnotatedValue[float]
+    utilization: float  # the factor used (for provenance)
+@dataclass(frozen=True)
+class DecodeEstimate:
+    """Immutable result of ``estimate_decode``.
+    The two ``moe_*`` fields stay ``None`` unless the profile is MoE and a
+    positive active-parameter ratio was supplied.
+    """
+    # Per-GPU weight shard as filled by estimate_decode: total bytes // num_gpus (at least 1).
+    active_weight_bytes_per_gpu: AnnotatedValue[int]
+    # Effective memory bandwidth divided by the per-GPU weight bytes.
+    per_gpu_tokens_per_sec: AnnotatedValue[float]
+    cluster_tokens_per_sec: AnnotatedValue[float]  # after comm efficiency
+    # The bandwidth-utilization and comm-efficiency factors used (for provenance).
+    bw_utilization: float
+    cluster_comm_efficiency: float
+    # Per-GPU weight bytes scaled by the MoE active/total ratio.
+    moe_active_weight_bytes_per_gpu: AnnotatedValue[int] | None = None
+    # Optimistic cluster-level throughput when only the active MoE weights are read.
+    moe_active_tokens_per_sec: AnnotatedValue[float] | None = None
+def estimate_prefill(
+    profile: ArchitectureProfile,
+    total_params: int,
+    gpu: GPUSpec,
+    num_gpus: int,
+    input_tokens: int,
+    utilization: float = DEFAULT_PREFILL_UTILIZATION,
+) -> PrefillEstimate:
+    """Estimate single-request prefill latency.
+    Based on compute: FLOPs = 2 × params × tokens; latency = FLOPs / effective_FLOPS.
+    """
+    flops = 2 * total_params * input_tokens
+    # TP distributes compute, so aggregate TFLOPS = num_gpus × per-card × util
+    aggregate_tflops = gpu.fp16_tflops * num_gpus * utilization
+    # Guard against zero
+    if aggregate_tflops <= 0 or total_params <= 0 or input_tokens <= 0:
+        return PrefillEstimate(
+            total_flops=AnnotatedValue(0, Label.UNKNOWN, source="insufficient inputs"),
+            peak_effective_tflops=AnnotatedValue(0.0, Label.UNKNOWN),
+            latency_ms=AnnotatedValue(0.0, Label.UNKNOWN),
+            utilization=utilization,
+        )
+    latency_s = flops / (aggregate_tflops * 1e12)
+    latency_ms = latency_s * 1000.0
+    return PrefillEstimate(
+        total_flops=AnnotatedValue(
+            flops,
+            Label.ESTIMATED,
+            source=f"2 × {total_params:,} params × {input_tokens:,} tokens",
+        ),
+        peak_effective_tflops=AnnotatedValue(
+            aggregate_tflops,
+            Label.ESTIMATED,
+            source=f"{gpu.fp16_tflops} × {num_gpus} GPUs × {utilization:.0%} util",
+        ),
+        latency_ms=AnnotatedValue(
+            latency_ms,
+            Label.ESTIMATED,
+            source=(f"{flops:.2e} FLOPs / ({aggregate_tflops:.1f} effective TFLOPS × 1e12)"),
+        ),
+        utilization=utilization,
+    )
+def _nvlink_efficiency(gpu: GPUSpec, num_gpus: int) -> float:
+    """Multiplier on cluster comm efficiency reflecting NVLink bandwidth.
+    Single-GPU has no TP all-reduce, so no penalty. H100 / B200 / H200 / A100-
+    SXM4 with full NVLink (>=900 GB/s aggregate, dropped to 600 for A100) get
+    ~1.0. Restricted-NVLink variants (H800: 400 GB/s, half of H100) pay ~8%.
+    PCIe-only cards (L40S, RTX) with no NVLink pay 20%.
+    """
+    if num_gpus <= 1:
+        return 1.0
+    nvlink = gpu.nvlink_bandwidth_gbps or 0
+    if nvlink >= 900:
+        return 1.0
+    if nvlink <= 0:
+        return 0.80
+    return 0.85 + 0.15 * (nvlink / 900.0)
+def estimate_decode(
+    profile: ArchitectureProfile,
+    total_weight_bytes: int,
+    gpu: GPUSpec,
+    num_gpus: int,
+    bw_utilization: float = DEFAULT_DECODE_BW_UTILIZATION,
+    cluster_comm_efficiency: float = DEFAULT_CLUSTER_COMM_EFFICIENCY,
+    moe_active_params_ratio: float | None = None,
+) -> DecodeEstimate:
+    """Estimate decode tokens/second.
+    Decode is memory-bandwidth-bound: per-token time = weight_bytes / bw.
+    Under TP, weights split across ranks, so per-GPU weight bytes = total / N.
+    If the model is MoE and moe_active_params_ratio is given (e.g. 0.3 for
+    active/total), we ALSO report an optimistic "active only" throughput.
+    """
+    if gpu.memory_bandwidth_gbps is None or gpu.memory_bandwidth_gbps <= 0:
+        _unknown = AnnotatedValue(
+            0, Label.UNKNOWN, source="GPU memory_bandwidth_gbps not in database"
+        )
+        _unknown_f = AnnotatedValue(
+            0.0, Label.UNKNOWN, source="GPU memory_bandwidth_gbps not in database"
+        )
+        return DecodeEstimate(
+            active_weight_bytes_per_gpu=_unknown,
+            per_gpu_tokens_per_sec=_unknown_f,
+            cluster_tokens_per_sec=_unknown_f,
+            bw_utilization=bw_utilization,
+            cluster_comm_efficiency=cluster_comm_efficiency,
+        )
+    bw_bytes_per_s = gpu.memory_bandwidth_gbps * 1e9  # GB/s → bytes/s
+    effective_bw = bw_bytes_per_s * bw_utilization
+    weight_per_gpu = max(1, total_weight_bytes // num_gpus)
+    per_gpu_tps = effective_bw / weight_per_gpu
+    # Cluster-level: per-GPU × N × comm_efficiency × NVLink-aware penalty.
+    # NVLink penalty captures TP all-reduce overhead on cards with restricted
+    # interconnect (H800, PCIe-only). Single-GPU is unaffected.
+    nvlink_eff = _nvlink_efficiency(gpu, num_gpus)
+    effective_comm_eff = cluster_comm_efficiency * nvlink_eff
+    cluster_tps = per_gpu_tps * num_gpus * effective_comm_eff
+    # MoE active-only optimistic view
+    moe_active_weight: AnnotatedValue[int] | None = None
+    moe_active_tps: AnnotatedValue[float] | None = None
+    if profile.is_moe and moe_active_params_ratio is not None and moe_active_params_ratio > 0:
+        active_bytes = int(weight_per_gpu * moe_active_params_ratio)
+        moe_active_weight = AnnotatedValue(
+            active_bytes,
+            Label.ESTIMATED,
+            source=f"{weight_per_gpu:,} × {moe_active_params_ratio:.3f} (active/total ratio)",
+        )
+        if active_bytes > 0:
+            active_per_gpu_tps = effective_bw / active_bytes
+            active_cluster_tps = active_per_gpu_tps * num_gpus * effective_comm_eff
+            moe_active_tps = AnnotatedValue(
+                active_cluster_tps,
+                Label.ESTIMATED,
+                source=(
+                    f"optimistic MoE active-only: effective_bw / {active_bytes:,} × "
+                    f"{num_gpus} × {effective_comm_eff:.3f}"
+                ),
+            )
+    return DecodeEstimate(
+        active_weight_bytes_per_gpu=AnnotatedValue(
+            weight_per_gpu,
+            Label.ESTIMATED,
+            source=f"{total_weight_bytes:,} bytes / {num_gpus} TP ranks",
+        ),
+        per_gpu_tokens_per_sec=AnnotatedValue(
+            per_gpu_tps,
+            Label.ESTIMATED,
+            source=(
+                f"{gpu.memory_bandwidth_gbps} GB/s × {bw_utilization:.0%} util / "
+                f"{weight_per_gpu:,} weight bytes"
+            ),
+        ),
+        cluster_tokens_per_sec=AnnotatedValue(
+            cluster_tps,
+            Label.ESTIMATED,
+            source=(
+                f"per-GPU × {num_gpus} GPUs × {cluster_comm_efficiency:.0%} comm × "
+                f"{nvlink_eff:.3f} NVLink penalty (NVLink={gpu.nvlink_bandwidth_gbps or 0} GB/s)"
+            ),
+        ),
+        bw_utilization=bw_utilization,
+        cluster_comm_efficiency=cluster_comm_efficiency,
+        moe_active_weight_bytes_per_gpu=moe_active_weight,
+        moe_active_tokens_per_sec=moe_active_tps,
+    )

src/llm_cal/performance/concurrency.py ADDED Viewed

	@@ -0,0 +1,132 @@

+"""Dual-bound concurrency analysis + bottleneck classification.
+Models two concurrency ceilings:
+  K = memory-capacity bound
+      (usable GPU memory ÷ per-request KV cache)
+  L = compute/bandwidth bound at a given SLA
+      (cluster decode throughput ÷ target per-user tokens/sec ÷ degradation)
+Max concurrent = min(K, L). Whichever is smaller names the bottleneck.
+"""
+from __future__ import annotations
+import math
+from dataclasses import dataclass
+from typing import Literal
+from llm_cal.output.labels import AnnotatedValue, Label
+from llm_cal.performance.compute import (
+    DEFAULT_CONCURRENCY_DEGRADATION,
+    DecodeEstimate,
+)
+# Closed set of verdict names stored in `ConcurrencyAnalysis.bottleneck`.
+# NOTE(review): "compute" is declared here but `analyze` in this module never
+# assigns it — throughput-limited results are reported as "memory_bandwidth".
+Bottleneck = Literal[
+    "memory_capacity",
+    "memory_bandwidth",
+    "compute",
+    "insufficient_data",
+]
+@dataclass(frozen=True)
+class ConcurrencyAnalysis:
+    """Immutable result of the dual-bound concurrency analysis.
+    Carries the memory-capacity bound (K), the throughput bound (L), the raw
+    inputs each bound was derived from, and the final verdict with an
+    English and a Chinese explanation.
+    """
+    # K bound
+    k_bound: AnnotatedValue[int]  # floor(headroom / KV bytes per request), never negative
+    k_source_headroom_bytes: int  # `cluster_headroom_bytes` as passed to `analyze`
+    k_source_kv_per_req_bytes: int  # `kv_bytes_per_request` as passed to `analyze`
+    # L bound
+    l_bound: AnnotatedValue[int]  # floor(cluster tok/s / target tok/s / degradation)
+    target_tokens_per_sec: float  # per-user throughput target used for L
+    degradation_factor: float  # divisor applied when computing L
+    # Verdict
+    max_concurrent: AnnotatedValue[int]  # the smaller of K and L
+    bottleneck: Bottleneck  # which bound was the limiting one
+    bottleneck_reason_en: str  # human-readable explanation, English
+    bottleneck_reason_zh: str  # human-readable explanation, Chinese
+def analyze(
+    *,
+    cluster_headroom_bytes: int,  # total KV headroom across all GPUs at ref context
+    kv_bytes_per_request: int,  # single-request KV cache at ref context
+    decode: DecodeEstimate,
+    target_tokens_per_sec: float,
+    degradation: float = DEFAULT_CONCURRENCY_DEGRADATION,
+) -> ConcurrencyAnalysis:
+    """Compute K and L bounds and pick the tighter one.
+    `cluster_headroom_bytes` and `kv_bytes_per_request` should be pre-adjusted
+    for TP sharding (see fleet planner for the same rule).
+    Args:
+        cluster_headroom_bytes: Memory available for KV cache across the cluster.
+        kv_bytes_per_request: KV cache size of one request; <= 0 means unknown.
+        decode: Decode estimate; only `cluster_tokens_per_sec.value` is read.
+        target_tokens_per_sec: Per-user throughput target; <= 0 means unknown.
+        degradation: Divisor applied to the throughput bound; <= 0 means unknown.
+    Returns:
+        A `ConcurrencyAnalysis`. If either bound cannot be computed the verdict
+        is "insufficient_data" with `max_concurrent` 0; an unknown bound is
+        never compared as if it were a real zero.
+    """
+    # K: how many requests fit in KV memory
+    k_known = kv_bytes_per_request > 0
+    if k_known:
+        # Negative headroom floors below zero, so clamp to "no request fits".
+        k = max(0, math.floor(cluster_headroom_bytes / kv_bytes_per_request))
+        k_label = Label.ESTIMATED
+        k_source = (
+            f"{cluster_headroom_bytes:,} bytes headroom / "
+            f"{kv_bytes_per_request:,} bytes per request"
+        )
+    else:
+        k = 0
+        k_label = Label.UNKNOWN
+        k_source = "KV cache per request is zero or unknown"
+    # L: how many concurrent users can maintain target tokens/sec
+    cluster_tps = decode.cluster_tokens_per_sec.value
+    l_known = cluster_tps > 0 and target_tokens_per_sec > 0 and degradation > 0
+    if l_known:
+        l_bound = max(0, math.floor(cluster_tps / target_tokens_per_sec / degradation))
+        l_label = Label.ESTIMATED
+        l_source = (
+            f"{cluster_tps:.1f} tok/s cluster / "
+            f"{target_tokens_per_sec:.1f} target / {degradation:.2f} degradation"
+        )
+    else:
+        l_bound = 0
+        l_label = Label.UNKNOWN
+        l_source = "cluster throughput or target is zero / unknown"
+    # Pick the tighter bound. The decision keys off whether each bound could be
+    # computed, not off its numeric value: a computed 0 is a real answer, while
+    # an unknown bound (also stored as 0) must not win the comparison.
+    if not (k_known and l_known):
+        max_n = 0
+        bottleneck: Bottleneck = "insufficient_data"
+        if not k_known and not l_known:
+            reason_en = "Both K and L unknown — cannot conclude."
+            reason_zh = "K 和 L 均未知，无法得出结论。"
+        elif not k_known:
+            reason_en = "K unknown — cannot conclude."
+            reason_zh = "K 未知，无法得出结论。"
+        else:
+            reason_en = "L unknown — cannot conclude."
+            reason_zh = "L 未知，无法得出结论。"
+    elif k <= l_bound:
+        max_n = k
+        bottleneck = "memory_capacity"
+        reason_en = (
+            f"K ({k}) ≤ L ({l_bound}) → memory-capacity bound. "
+            "KV cache exhausts GPU headroom before throughput SLA does."
+        )
+        reason_zh = (
+            f"K ({k}) ≤ L ({l_bound}) → 显存容量瓶颈。先达到 KV cache 容量上限，才到吞吐目标。"
+        )
+    else:
+        max_n = l_bound
+        # Whether it's "compute" or "bandwidth" depends on where decode is bound.
+        # For v0.1 we just say "memory bandwidth / compute" since decode is
+        # bw-bound by default and the two share the same formula output.
+        bottleneck = "memory_bandwidth"
+        reason_en = (
+            f"L ({l_bound}) < K ({k}) → memory-bandwidth / compute bound. "
+            "Cluster can't sustain target tok/s per user at this concurrency."
+        )
+        reason_zh = f"L ({l_bound}) < K ({k}) → 带宽/算力瓶颈。集群在此并发下无法维持目标 tok/s。"
+    return ConcurrencyAnalysis(
+        k_bound=AnnotatedValue(k, k_label, source=k_source),
+        k_source_headroom_bytes=cluster_headroom_bytes,
+        k_source_kv_per_req_bytes=kv_bytes_per_request,
+        l_bound=AnnotatedValue(l_bound, l_label, source=l_source),
+        target_tokens_per_sec=target_tokens_per_sec,
+        degradation_factor=degradation,
+        max_concurrent=AnnotatedValue(
+            max_n,
+            Label.ESTIMATED if max_n > 0 else Label.UNKNOWN,
+            source=f"min(K={k}, L={l_bound})",
+        ),
+        bottleneck=bottleneck,
+        bottleneck_reason_en=reason_en,
+        bottleneck_reason_zh=reason_zh,
+    )

src/llm_cal/weight_analyzer/__init__.py ADDED Viewed

	@@ -0,0 +1,146 @@

+"""Weight analyzer — observed bytes + inferred quantization scheme.
+Rules:
+- `[verified]` — directly from HF/ModelScope API (sum of siblings[].size). Nothing else.
+- `[inferred]` — any derivation, including bits/param and quantization guess.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Literal
+from llm_cal.model_source.base import SiblingFile
+from llm_cal.output.labels import AnnotatedValue, Label
+if TYPE_CHECKING:
+    from llm_cal.weight_analyzer.fingerprint import QuantFingerprint
+# Closed set of quantization scheme names this package can report.
+QuantizationScheme = Literal[
+    "FP16",
+    "BF16",
+    "FP8",
+    "INT8",
+    "FP4_FP8_MIXED",  # DeepSeek-V4-Flash style
+    "INT4",
+    "GPTQ_INT4",
+    "AWQ_INT4",
+    "UNKNOWN",
+]
+# Rough bytes-per-param (bpp) anchor points; bits/param = bpp * 8.
+# Used by reconciler.
+_QUANT_BPP: dict[QuantizationScheme, float] = {
+    "FP16": 2.00,
+    "BF16": 2.00,
+    "FP8": 1.00,
+    "INT8": 1.00,
+    "FP4_FP8_MIXED": 0.55,  # DeepSeek V4 empirical (0.55 bpp = 4.4 bits/param)
+    "INT4": 0.50,
+    "GPTQ_INT4": 0.55,  # +scale tensors overhead
+    "AWQ_INT4": 0.55,
+    "UNKNOWN": 0.0,  # placeholder, not a real anchor; the reconciler skips it
+}
+@dataclass(frozen=True)
+class WeightReport:
+    """Everything the weight analyzer can determine from files + params."""
+    # [verified] — sum of the `*.safetensors` sibling sizes
+    total_bytes: AnnotatedValue[int]
+    # [inferred] — None when the param count is unknown or no bytes were observed
+    bits_per_param: AnnotatedValue[float] | None
+    # [inferred] from bytes/param, or [verified] when `analyze` is given a fingerprint
+    quantization_guess: AnnotatedValue[QuantizationScheme]
+def _safetensors_total_bytes(siblings: tuple[SiblingFile, ...]) -> int:
+    """Add up the sizes of the `*.safetensors` siblings.
+    Files with any other suffix (config, tokenizer, ...) are ignored, and a
+    sibling whose size is missing contributes 0.
+    """
+    running_total = 0
+    for entry in siblings:
+        if entry.filename.endswith(".safetensors"):
+            running_total += entry.size or 0
+    return running_total
+def analyze(
+    siblings: tuple[SiblingFile, ...],
+    total_params: int | None,
+    fingerprint: QuantFingerprint | None = None,
+) -> WeightReport:
+    """Compute weight report from sibling files + param count.
+    `total_params` comes from summing across the architecture (computed elsewhere)
+    or is None if we couldn't determine it — in which case the bits/param
+    inference is skipped and `bits_per_param` is None.
+    `fingerprint` (optional) is authoritative evidence from config.json or
+    safetensors header. When present, it overrides the bpp nearest-match
+    heuristic for quantization_guess (VERIFIED instead of INFERRED). It does
+    not depend on the byte/param ratio, so it is also honoured when
+    `total_params` is unknown or no safetensors bytes were observed.
+    Returns:
+        A `WeightReport`. `quantization_guess` is UNKNOWN only when there is
+        neither a fingerprint nor enough data for the bpp heuristic.
+    """
+    observed_bytes = _safetensors_total_bytes(siblings)
+    total_bytes = AnnotatedValue(
+        observed_bytes,
+        Label.VERIFIED,
+        source="sum of safetensors siblings from model_info API",
+    )
+    # Build the declared verdict up front so both return paths can use it.
+    declared: AnnotatedValue[QuantizationScheme] | None = None
+    if fingerprint is not None:
+        declared = AnnotatedValue(
+            fingerprint.scheme,
+            Label.VERIFIED,
+            source=fingerprint.evidence,
+        )
+    if not total_params or observed_bytes == 0:
+        # No ratio can be formed; fall back to the declaration if there is one.
+        if declared is None:
+            declared = AnnotatedValue(
+                "UNKNOWN",
+                Label.UNKNOWN,
+                source="total_params unknown or no safetensors files",
+            )
+        return WeightReport(
+            total_bytes=total_bytes,
+            bits_per_param=None,
+            quantization_guess=declared,
+        )
+    bpp = observed_bytes / total_params
+    bits_per_param = AnnotatedValue(
+        bpp * 8,
+        Label.INFERRED,
+        source=f"{observed_bytes} bytes / {total_params} params",
+    )
+    quant = declared if declared is not None else _guess_quantization(bpp)
+    return WeightReport(
+        total_bytes=total_bytes,
+        bits_per_param=bits_per_param,
+        quantization_guess=quant,
+    )
+def _guess_quantization(bpp: float) -> AnnotatedValue[QuantizationScheme]:
+    """Pick the anchor scheme whose bytes/param lies closest to `bpp`.
+    Every anchor has its own tolerance, expressed in bytes/param (the same
+    unit as `bpp`): 0.10 for the mixed-precision entries (scale tensors,
+    FP16 embeddings, etc.) and 0.05 for the pure ones. Anchors further away
+    than their tolerance are ignored; if none qualifies the result is UNKNOWN.
+    When two anchors are equally close, the one listed first wins.
+    """
+    # (scheme, tolerance in bytes/param); order only matters for exact ties.
+    table: tuple[tuple[QuantizationScheme, float], ...] = (
+        ("FP16", 0.05),
+        ("FP8", 0.05),
+        ("FP4_FP8_MIXED", 0.10),
+        ("INT4", 0.05),
+        ("GPTQ_INT4", 0.10),
+    )
+    in_range = [
+        (abs(bpp - _QUANT_BPP[name]), name)
+        for name, allowed in table
+        if abs(bpp - _QUANT_BPP[name]) <= allowed
+    ]
+    bits = bpp * 8
+    if not in_range:
+        return AnnotatedValue(
+            "UNKNOWN",
+            Label.UNKNOWN,
+            source=f"bits/param {bits:.2f} does not match known schemes",
+        )
+    # min() keeps the first of several equally small distances.
+    _, winner = min(in_range, key=lambda pair: pair[0])
+    return AnnotatedValue(
+        winner,
+        Label.INFERRED,
+        source=f"bits/param {bits:.2f} within tolerance of {winner}",
+    )

src/llm_cal/weight_analyzer/fingerprint.py ADDED Viewed

	@@ -0,0 +1,292 @@

+"""Quantization fingerprinting — tie-breakers for the reconciler.
+When `reconciler.reconcile` has multiple schemes tied at the same bits/param
+(FP4_FP8_MIXED, GPTQ_INT4, and AWQ_INT4 all sit at bpp=0.55), bytes alone
+cannot pick a winner. We resolve the ambiguity with two stronger signals:
+  1. `quantization_config` in config.json — explicit declaration by the model
+     author. Covers most GPTQ/AWQ/FP8 community uploads.
+  2. safetensors per-tensor dtype + tensor-name patterns — the ground truth.
+     Covers models like DeepSeek-V4-Flash that use custom mixed-precision
+     packs without a config.json declaration.
+Both return a `QuantFingerprint`. The reconciler uses the fingerprint's
+`scheme` as a tie-breaker, and the `evidence` string flows into the
+derivation trace.
+This module is pure — no network, no file I/O. `safetensors_reader.py`
+handles fetching; this module interprets what was fetched.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Any, Literal
+from llm_cal.weight_analyzer import QuantizationScheme
+SourceType = Literal["config_json", "safetensors_header"]
+@dataclass(frozen=True)
+class QuantFingerprint:
+    """A quantization scheme identified from model metadata, plus its provenance."""
+    scheme: QuantizationScheme  # the scheme the metadata points to
+    source_type: SourceType  # which kind of metadata produced this fingerprint
+    evidence: str  # for the derivation trace
+# ---------------------------------------------------------------------------
+# Config.json: explicit quant_method declaration
+def from_config(config: dict[str, Any]) -> QuantFingerprint | None:
+    """Read `config.json` `quantization_config` and map to a scheme.
+    Returns None if no `quantization_config` block exists (model either
+    unquantized in-config or uses a per-tensor pack without declaration), or
+    if the block does not match any pattern handled here. Malformed nested
+    values (a non-dict `config_groups` or `weights`) are treated as absent
+    rather than raising.
+    """
+    qc = config.get("quantization_config")
+    if not isinstance(qc, dict):
+        return None
+    def _declared(scheme: QuantizationScheme, evidence: str) -> QuantFingerprint:
+        # Every fingerprint built here comes from config.json.
+        return QuantFingerprint(scheme=scheme, source_type="config_json", evidence=evidence)
+    quant_method = qc.get("quant_method")
+    bits = qc.get("bits")
+    weight_dtype = qc.get("weight_dtype")
+    # GPTQ family
+    if quant_method == "gptq":
+        if bits == 4:
+            return _declared(
+                "GPTQ_INT4", "config.json quantization_config.quant_method=gptq, bits=4"
+            )
+        if bits == 8:
+            return _declared("INT8", "config.json quantization_config.quant_method=gptq, bits=8")
+    # AWQ family
+    if quant_method == "awq" and bits == 4:
+        return _declared("AWQ_INT4", "config.json quantization_config.quant_method=awq, bits=4")
+    # FP8 (native or compressed-tensors wrapping)
+    if quant_method == "fp8":
+        return _declared("FP8", "config.json quantization_config.quant_method=fp8")
+    # compressed-tensors (RedHatAI etc.) — inspect inner weight dtype
+    if quant_method == "compressed-tensors":
+        # The config_groups.<group>.weights.type can be "float", "int", etc.
+        # and num_bits gives 4/8. Only the first dict-valued group is
+        # inspected; later groups are not consulted.
+        groups = qc.get("config_groups")
+        candidates = groups.values() if isinstance(groups, dict) else ()
+        first_group = next((g for g in candidates if isinstance(g, dict)), None)
+        if first_group is not None:
+            weights = first_group.get("weights")
+            if not isinstance(weights, dict):
+                weights = {}
+            num_bits = weights.get("num_bits")
+            wtype = weights.get("type")
+            if num_bits == 8 and wtype in ("float", "fp8"):
+                return _declared("FP8", "config.json compressed-tensors group weights=fp8/8bit")
+            if num_bits == 8 and wtype == "int":
+                return _declared("INT8", "config.json compressed-tensors group weights=int/8bit")
+            if num_bits == 4 and wtype == "int":
+                # Generic INT4 — don't claim GPTQ or AWQ without more evidence
+                return _declared("INT4", "config.json compressed-tensors group weights=int/4bit")
+    # bitsandbytes — load_in_4bit / load_in_8bit flags
+    if quant_method == "bitsandbytes":
+        if qc.get("load_in_4bit"):
+            return _declared("INT4", "config.json quant_method=bitsandbytes, load_in_4bit=true")
+        if qc.get("load_in_8bit"):
+            return _declared("INT8", "config.json quant_method=bitsandbytes, load_in_8bit=true")
+    # Standalone weight_dtype (no nested groups — some custom loaders)
+    if weight_dtype in ("float8_e4m3fn", "float8_e5m2"):
+        return _declared("FP8", f"config.json quantization_config.weight_dtype={weight_dtype}")
+    return None
+# ---------------------------------------------------------------------------
+# Safetensors header: per-tensor dtype + tensor-name patterns
+# safetensors dtype strings (from the format spec)
+_FP8_DTYPES = frozenset({"F8_E4M3", "F8_E5M2"})
+_FP4_DTYPES = frozenset({"F4_E2M1", "F4"})  # F4 is used by some toolchains
+_FP16_DTYPES = frozenset({"F16"})
+_BF16_DTYPES = frozenset({"BF16"})
+_INT8_DTYPES = frozenset({"I8", "U8"})
+# F8_E8M0 is the 8-bit shared-exponent scaling factor used by MX-format
+# block-scaled quantization (MXFP4, MXFP8). Its presence alongside packed
+# integer weights (I8) is the signature of FP4 weight packing.
+_MX_SCALE_DTYPES = frozenset({"F8_E8M0"})
+def from_safetensors_dtypes(tensor_dtypes: dict[str, str]) -> QuantFingerprint | None:
+    """Fingerprint from a parsed safetensors header (tensor_name -> dtype string).
+    Only considers "weight-like" tensors. Non-weight tensors (norms, biases,
+    embeddings, LayerNorm params) often stay in FP16/BF16 even in heavily
+    quantized models, so counting them directly would give a wrong picture.
+    Returns None for an empty header or for a dtype mix that has no named
+    scheme here.
+    """
+    if not tensor_dtypes:
+        return None
+    def _found(scheme: QuantizationScheme, evidence: str) -> QuantFingerprint:
+        # Every fingerprint built here comes from a safetensors header.
+        return QuantFingerprint(
+            scheme=scheme, source_type="safetensors_header", evidence=evidence
+        )
+    names = tuple(tensor_dtypes)
+    # ------------------------------------------------------------------
+    # Packed-int4 schemes first — they have distinctive tensor-name markers
+    # even though the underlying dtype is I32 (bit-packed).
+    has_qweight = any(n.endswith((".qweight", "_qweight")) for n in names)
+    has_g_idx = any(n.endswith((".g_idx", "_g_idx")) for n in names)
+    has_qzeros = any(n.endswith((".qzeros", "_qzeros")) for n in names)
+    if has_qweight and has_g_idx:
+        return _found(
+            "GPTQ_INT4", "safetensors header has .qweight + .g_idx tensors (GPTQ marker)"
+        )
+    # Reaching this line with a qweight means there is no g_idx.
+    if has_qweight and has_qzeros:
+        return _found(
+            "AWQ_INT4", "safetensors header has .qweight + .qzeros, no .g_idx (AWQ marker)"
+        )
+    # ------------------------------------------------------------------
+    # Dtype histogram over weight-like tensors.
+    # Skip norms / biases / embeddings which typically don't get quantized.
+    def _is_weight_tensor(name: str) -> bool:
+        lname = name.lower()
+        # Match "norm" anywhere, not just ".norm": names such as
+        # `input_layernorm.weight` or `q_norm.weight` have no dot before
+        # "norm" and would otherwise be counted as weights.
+        if any(sub in lname for sub in ("norm", ".bias", "embed", "lm_head")):
+            return False
+        # Tensor names in transformer models usually contain "weight"
+        return "weight" in lname or lname.endswith((".w", ".proj"))
+    weight_dtypes: list[str] = [dt for n, dt in tensor_dtypes.items() if _is_weight_tensor(n)]
+    if not weight_dtypes:
+        # Fall back to all dtypes if the name heuristic found nothing
+        weight_dtypes = list(tensor_dtypes.values())
+    def _count(family: frozenset[str]) -> int:
+        return sum(dt in family for dt in weight_dtypes)
+    n_fp4 = _count(_FP4_DTYPES)
+    n_fp8 = _count(_FP8_DTYPES)
+    n_int8 = _count(_INT8_DTYPES)
+    has_fp4 = n_fp4 > 0
+    has_fp8 = n_fp8 > 0
+    has_int8 = n_int8 > 0
+    has_fp16 = _count(_FP16_DTYPES) > 0
+    has_bf16 = _count(_BF16_DTYPES) > 0
+    # Scale tensors are looked up over every tensor, not just weight-like ones.
+    has_mx_scale = any(dt in _MX_SCALE_DTYPES for dt in tensor_dtypes.values())
+    # MX-format block-scaled quantization (DeepSeek-V4-Flash pattern):
+    # F8_E8M0 scale tensors + packed I8 weights, plus a layer of F8_E4M3 for
+    # the FP8 sub-pack. Detected via the scale-dtype signature.
+    if has_mx_scale and has_int8:
+        if has_fp8:
+            return _found(
+                "FP4_FP8_MIXED",
+                f"safetensors header: F8_E8M0 scale tensors + "
+                f"{n_int8} packed-I8 "
+                f"(FP4) weights + "
+                f"{n_fp8} FP8 weights — "
+                f"MX block-scaled mixed pack",
+            )
+        # MXFP4 only — nominally INT4 but with the MX scaling envelope.
+        # FP4_FP8_MIXED is the closest existing scheme; bpp ≈ 0.55 anchor.
+        return _found(
+            "FP4_FP8_MIXED",
+            f"safetensors header: F8_E8M0 scale tensors + "
+            f"{n_int8} packed-I8 "
+            f"(FP4) weights — MXFP4 block-scaled",
+        )
+    # Classic FP4 + FP8 mixed (older toolchains exposing F4 dtype directly)
+    if has_fp4 and has_fp8:
+        return _found(
+            "FP4_FP8_MIXED",
+            f"safetensors header has both FP4 and FP8 weight tensors "
+            f"({n_fp4} FP4, "
+            f"{n_fp8} FP8)",
+        )
+    # FP8 with no FP4 or INT8 weights alongside it
+    if has_fp8 and not (has_fp4 or has_int8):
+        return _found(
+            "FP8",
+            f"safetensors header: {n_fp8}/{len(weight_dtypes)} weight tensors are FP8",
+        )
+    # Pure FP16
+    if has_fp16 and not (has_fp8 or has_fp4 or has_int8 or has_bf16):
+        return _found(
+            "FP16",
+            f"safetensors header: all {len(weight_dtypes)} weight tensors are F16",
+        )
+    # Pure BF16
+    if has_bf16 and not (has_fp8 or has_fp4 or has_int8 or has_fp16):
+        return _found(
+            "BF16",
+            f"safetensors header: all {len(weight_dtypes)} weight tensors are BF16",
+        )
+    # Pure INT8
+    if has_int8 and not (has_fp8 or has_fp4 or has_fp16 or has_bf16):
+        return _found(
+            "INT8",
+            f"safetensors header: {len(weight_dtypes)} weight tensors are INT8",
+        )
+    # Mixed in a way we don't have a named scheme for — stay silent
+    return None

src/llm_cal/weight_analyzer/reconciler.py ADDED Viewed

	@@ -0,0 +1,247 @@

+"""Reconciler — compare observed weight bytes vs computed under each quantization assumption.
+This is the module that outputs the DeepSeek-V4-Flash story (Problem Evidence in design doc):
+"gpu_poor says 285 GB (assumes pure FP8); we say 160 GB (observed bytes match FP4+FP8
+ pack hypothesis). Here's why."
+Core value: makes the quantization inference step transparent. The user sees all
+candidates considered, not just the winner.
+When multiple schemes share the same bytes-per-param anchor (FP4_FP8_MIXED,
+GPTQ_INT4, and AWQ_INT4 all sit at bpp=0.55), bytes alone cannot pick a winner.
+Pass a `QuantFingerprint` from `fingerprint.from_config()` or
+`fingerprint.from_safetensors_dtypes()` to break the tie with authoritative
+evidence.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from llm_cal.output.labels import AnnotatedValue, Label
+from llm_cal.weight_analyzer import _QUANT_BPP, QuantizationScheme
+from llm_cal.weight_analyzer.fingerprint import QuantFingerprint
+@dataclass(frozen=True)
+class ReconciliationCandidate:
+    """One quantization hypothesis and how well it explains the observed bytes."""
+    scheme: QuantizationScheme
+    predicted_bytes: int  # int(bytes-per-param anchor * total_params)
+    delta_bytes: int  # observed - predicted (positive = observed is larger)
+    relative_error: float  # |delta| / predicted; inf when predicted is 0
+@dataclass(frozen=True)
+class ReconciliationReport:
+    """Full outcome of a reconciliation: every candidate tested plus the verdict."""
+    observed_bytes: int
+    total_params: int
+    candidates: tuple[ReconciliationCandidate, ...]  # sorted by |relative_error| asc
+    best: AnnotatedValue[QuantizationScheme]
+    def summary_line(self) -> str:
+        """One-liner for output formatter.
+        Reports the candidate whose scheme equals `best.value`, so the line
+        agrees with the verdict even when that verdict is not the smallest-error
+        candidate (e.g. a fingerprint picked another scheme). If no candidate
+        matches `best` (verdict UNKNOWN), the closest candidate is shown but
+        is not called a match.
+        """
+        if not self.candidates:
+            return f"{self.observed_bytes:,} bytes — no quantization candidates tested"
+        chosen = next((c for c in self.candidates if c.scheme == self.best.value), None)
+        if chosen is None:
+            closest = self.candidates[0]
+            return (
+                f"Observed {self.observed_bytes:,} bytes. "
+                f"No confident match; closest: {closest.scheme} "
+                f"(predicts {closest.predicted_bytes:,} bytes, "
+                f"{closest.relative_error * 100:.1f}% error)"
+            )
+        return (
+            f"Observed {self.observed_bytes:,} bytes. "
+            f"Best match: {chosen.scheme} "
+            f"(predicts {chosen.predicted_bytes:,} bytes, "
+            f"{chosen.relative_error * 100:.1f}% error)"
+        )
+# Tolerance for tie detection — schemes within this relative-error delta of the
+# winner are considered tied. It is a difference between two relative errors,
+# so 0.01 means one percentage point.
+_TIE_THRESHOLD = 0.01
+# Tolerance gate — if the closest candidate is off by more than this, call UNKNOWN.
+# Compared against `relative_error` (|delta| / predicted), so 0.15 means 15%.
+_UNKNOWN_THRESHOLD = 0.15
+def reconcile(
+    observed_bytes: int,
+    total_params: int,
+    fingerprint: QuantFingerprint | None = None,
+) -> ReconciliationReport:
+    """Rank every known quantization scheme by how well it predicts the file size.
+    Args:
+        observed_bytes: Sum of safetensors file sizes.
+        total_params: Estimated param count.
+        fingerprint: Optional authoritative evidence from config.json or
+            safetensors header. When given, the verdict is delegated to
+            `_reconcile_with_fingerprint`.
+    Returns:
+        A report holding the full ranking (smallest relative error first) and
+        the verdict. Without a fingerprint the verdict is UNKNOWN when even the
+        closest scheme misses by more than `_UNKNOWN_THRESHOLD`; otherwise it
+        is the closest scheme, with a note when other schemes are tied with it.
+    """
+    def _report(
+        ranking: tuple[ReconciliationCandidate, ...],
+        verdict: AnnotatedValue[QuantizationScheme],
+    ) -> ReconciliationReport:
+        return ReconciliationReport(
+            observed_bytes=observed_bytes,
+            total_params=total_params,
+            candidates=ranking,
+            best=verdict,
+        )
+    if observed_bytes == 0 or total_params == 0:
+        return _report(
+            (),
+            AnnotatedValue(
+                "UNKNOWN",
+                Label.UNKNOWN,
+                source="observed_bytes or total_params is zero",
+            ),
+        )
+    scored: list[ReconciliationCandidate] = []
+    for scheme, bpp in _QUANT_BPP.items():
+        if scheme == "UNKNOWN" or bpp == 0.0:
+            continue
+        predicted = int(bpp * total_params)
+        delta = observed_bytes - predicted
+        scored.append(
+            ReconciliationCandidate(
+                scheme=scheme,
+                predicted_bytes=predicted,
+                delta_bytes=delta,
+                # A prediction of 0 bytes cannot be compared; rank it last.
+                relative_error=abs(delta) / predicted if predicted else float("inf"),
+            )
+        )
+    ranked = tuple(sorted(scored, key=lambda c: c.relative_error))
+    argmin_scheme = ranked[0].scheme
+    argmin_err = ranked[0].relative_error
+    # Fingerprint path: authoritative declaration from config.json or safetensors
+    # header. This is the primary fix for the tie that LLM review caught.
+    if fingerprint is not None:
+        return _reconcile_with_fingerprint(
+            observed_bytes=observed_bytes,
+            total_params=total_params,
+            candidates=ranked,
+            fingerprint=fingerprint,
+            argmin_scheme=argmin_scheme,
+            argmin_err=argmin_err,
+        )
+    # Tolerance gate without fingerprint
+    if argmin_err > _UNKNOWN_THRESHOLD:
+        return _report(
+            ranked,
+            AnnotatedValue(
+                "UNKNOWN",
+                Label.UNKNOWN,
+                source=(
+                    f"closest candidate ({argmin_scheme}) is off by "
+                    f"{argmin_err * 100:.1f}% — no confident match"
+                ),
+            ),
+        )
+    # Bytes-only tie detection (the winner itself is always part of this list)
+    tied_schemes = [
+        c.scheme
+        for c in ranked
+        if c.relative_error <= _UNKNOWN_THRESHOLD
+        and abs(c.relative_error - argmin_err) < _TIE_THRESHOLD
+    ]
+    source_text = f"best match among {len(ranked)} candidates, {argmin_err * 100:.1f}% error"
+    if len(tied_schemes) > 1:
+        rivals = ", ".join(s for s in tied_schemes if s != argmin_scheme)
+        source_text += (
+            f" — tied with {rivals} "
+            f"at the same bits/param; distinguishing requires config.json "
+            f"quantization_config or safetensors per-tensor dtype "
+            f"(neither available for this model)"
+        )
+    return _report(ranked, AnnotatedValue(argmin_scheme, Label.INFERRED, source=source_text))
+def _reconcile_with_fingerprint(
+    observed_bytes: int,
+    total_params: int,
+    candidates: tuple[ReconciliationCandidate, ...],
+    fingerprint: QuantFingerprint,
+    argmin_scheme: QuantizationScheme,
+    argmin_err: float,
+) -> ReconciliationReport:
+    """Produce the verdict when a fingerprint is available.
+    Three outcomes:
+      - The declared scheme is not among the candidates → fall back to the
+        bytes-only winner, labelled INFERRED, and say so in the source text.
+      - The declared scheme's bytes-error is within `_UNKNOWN_THRESHOLD` →
+        adopt it, labelled VERIFIED. If bytes alone would have picked a
+        different, closer scheme, the source text mentions it.
+      - The declared scheme's bytes-error exceeds the threshold → still adopt
+        it as VERIFIED, with the discrepancy recorded in the source text
+        (usually the param estimate is off, not the declaration).
+    """
+    declared = fingerprint.scheme
+    hit: ReconciliationCandidate | None = None
+    for cand in candidates:
+        if cand.scheme == declared:
+            hit = cand
+            break
+    if hit is None:
+        # Unknown scheme from fingerprint — degrade gracefully to bytes-only.
+        verdict = AnnotatedValue(
+            argmin_scheme,
+            Label.INFERRED,
+            source=(
+                f"fingerprint declared {declared} ({fingerprint.evidence}) "
+                f"but we have no bpp anchor for it; fell back to bytes match "
+                f"{argmin_scheme} at {argmin_err * 100:.1f}% error"
+            ),
+        )
+    elif hit.relative_error > _UNKNOWN_THRESHOLD:
+        # Disagreement: the declaration is trusted over the byte count.
+        verdict = AnnotatedValue(
+            declared,
+            Label.VERIFIED,
+            source=(
+                f"{fingerprint.evidence} "
+                f"(NOTE: bytes predict {hit.predicted_bytes:,}, off by "
+                f"{hit.relative_error * 100:.1f}% — likely our param estimate is off, "
+                f"not the declaration)"
+            ),
+        )
+    else:
+        # Agreement — the declared scheme is plausible given the bytes.
+        bytes_prefer_other = declared != argmin_scheme and argmin_err < hit.relative_error
+        note = (
+            f" (bytes alone would argmin to {argmin_scheme} at "
+            f"{argmin_err * 100:.1f}%; we trust the declaration)"
+            if bytes_prefer_other
+            else ""
+        )
+        verdict = AnnotatedValue(
+            declared,
+            Label.VERIFIED,
+            source=(
+                f"{fingerprint.evidence} "
+                f"(predicts {hit.predicted_bytes:,} bytes, "
+                f"{hit.relative_error * 100:.1f}% error){note}"
+            ),
+        )
+    return ReconciliationReport(
+        observed_bytes=observed_bytes,
+        total_params=total_params,
+        candidates=candidates,
+        best=verdict,
+    )

src/llm_cal/weight_analyzer/safetensors_reader.py ADDED Viewed

	@@ -0,0 +1,163 @@

+"""Fetch the safetensors header of one shard to recover per-tensor dtypes.
+The safetensors binary format:
+  bytes[0..8]       uint64 little-endian  header length N (JSON bytes)
+  bytes[8..8+N]     UTF-8 JSON            tensor_name -> {dtype, shape, data_offsets}
+  bytes[8+N..]      raw tensor data       (we never read this)
+So we can identify every tensor's dtype without downloading any weight bytes.
+Headers are usually 50 KB - 2 MB. We cap the Range request at 16 MB as a
+safety net; anything larger is treated as malformed.
+This module NEVER raises on network or parse error — it returns None so
+the caller can degrade gracefully. The honesty principle: "we tried and
+failed to resolve the tie" is a legitimate outcome, not a fatal error.
+"""
+from __future__ import annotations
+import json
+import struct
+from typing import Any
+import httpx
+from llm_cal.model_source.auth import get_hf_token, get_modelscope_token
+from llm_cal.model_source.base import SiblingFile
+# Largest header length (the uint64 prefix value) that parse_header accepts;
+# a declared length of zero or above this cap is treated as malformed.
+_MAX_HEADER_BYTES = 16 * 1024 * 1024  # 16 MB — far above any realistic header
+# Size of the leading byte window requested through the HTTP Range header.
+# NOTE(review): this equals _MAX_HEADER_BYTES, yet a full header needs
+# 8 + N bytes, so a header within 8 bytes of the cap cannot be parsed.
+_RANGE_FETCH_BYTES = 16 * 1024 * 1024
+# Default for the `timeout_s` parameter of fetch_tensor_dtypes (seconds).
+_DEFAULT_TIMEOUT_S = 15.0
+def pick_sample_shard(siblings: tuple[SiblingFile, ...]) -> SiblingFile | None:
+    """Select the one safetensors file whose header should be inspected.
+    Selection rules, in priority order:
+      * A file named exactly `model.safetensors` wins outright: it is the
+        single-file layout, so it necessarily covers the whole model.
+      * Otherwise every `*.safetensors` entry is ordered by filename and
+        the entry at index `len // 2` is returned (the upper middle for
+        an even count). The middle is preferred over the first shard
+        because the first shard tends to carry embeddings, lm_head and
+        early norms, which are often kept in BF16/FP16 even when the bulk
+        of the model is quantized; a middle shard usually holds ordinary
+        decoder / MoE-expert weights and so better reflects the headline
+        quantization.
+      * Files that ignore the shard naming convention go through the same
+        middle-of-sorted-order rule.
+    Returns None when `siblings` contains no `*.safetensors` entry.
+    """
+    candidates = []
+    for entry in siblings:
+        name = entry.filename
+        if not name.endswith(".safetensors"):
+            continue
+        if name == "model.safetensors":
+            # Single-file checkpoint: nothing else can be more representative.
+            return entry
+        candidates.append(entry)
+    if not candidates:
+        return None
+    # Stable in-place sort by filename, then take the middle element.
+    candidates.sort(key=lambda entry: entry.filename)
+    return candidates[len(candidates) // 2]
+def fetch_tensor_dtypes(
+    source: str,
+    model_id: str,
+    revision: str,
+    shard_filename: str,
+    endpoint: str | None = None,
+    timeout_s: float = _DEFAULT_TIMEOUT_S,
+) -> dict[str, str] | None:
+    """Range-fetch the safetensors header of one shard and return dtype map.
+    Args:
+        source: Hub identifier; `_build_request` decides which are supported.
+        model_id: Repository id on that hub.
+        revision: Branch, tag or commit to read from.
+        shard_filename: Path of the safetensors file inside the repository.
+        endpoint: Optional base-URL override for the hub.
+        timeout_s: Timeout in seconds handed to httpx.
+    Returns:
+        A dict of `{tensor_name: dtype_string}` on success, None on any
+        failure (unsupported source, network error, non-200/206 status,
+        unparseable header). Non-fatal by design so the caller can still
+        report a verdict without per-tensor refinement.
+    """
+    url, headers = _build_request(source, model_id, revision, shard_filename, endpoint)
+    if url is None:
+        return None
+    headers = {**headers, "Range": f"bytes=0-{_RANGE_FETCH_BYTES - 1}"}
+    received = bytearray()
+    try:
+        # Stream instead of buffering the whole body: a server (or proxy)
+        # that ignores Range answers 200 with the full multi-GB shard, and
+        # we must never download more than the header window.
+        with httpx.stream(
+            "GET", url, headers=headers, timeout=timeout_s, follow_redirects=True
+        ) as resp:
+            # 200 for files returned in full; 206 for an actual Range response.
+            # Anything else (404, 403, 500, ...) we degrade silently.
+            if resp.status_code not in (200, 206):
+                return None
+            for chunk in resp.iter_bytes():
+                received += chunk
+                if len(received) >= _RANGE_FETCH_BYTES:
+                    break
+    except (httpx.HTTPError, httpx.InvalidURL):
+        # HTTPError already covers timeouts and connection failures;
+        # InvalidURL is a separate hierarchy (e.g. a malformed endpoint).
+        return None
+    return parse_header(bytes(received[:_RANGE_FETCH_BYTES]))
+def _build_request(
+    source: str,
+    model_id: str,
+    revision: str,
+    shard_filename: str,
+    endpoint: str | None,
+) -> tuple[str | None, dict[str, str]]:
+    """Compose URL + auth headers for the source. Returns (None, {}) on unknown.
+    The revision and file path are percent-encoded so that values such as
+    `refs/pr/1` or filenames containing spaces, `&`, `#` or `?` cannot alter
+    the structure of the URL. Ordinary names (alphanumerics, `-`, `_`, `.`,
+    and `/` inside file paths) produce exactly the same URL as unescaped
+    interpolation would.
+    Args:
+        source: "huggingface" or "modelscope"; anything else is unsupported.
+        model_id: Repository id, inserted into the path as-is.
+        revision: Branch, tag or commit.
+        shard_filename: Path of the file inside the repository.
+        endpoint: Optional base-URL override; a trailing "/" is stripped.
+    Returns:
+        `(url, headers)` where headers carries a Bearer token when one is
+        configured for the source, or `(None, {})` for an unknown source.
+    """
+    from urllib.parse import quote, urlencode
+    if source == "huggingface":
+        base = (endpoint or "https://huggingface.co").rstrip("/")
+        # A revision is a single path segment, so its slashes must be
+        # escaped; the file path keeps "/" as a directory separator.
+        safe_revision = quote(revision, safe="")
+        safe_path = quote(shard_filename)
+        url = f"{base}/{model_id}/resolve/{safe_revision}/{safe_path}"
+        token = get_hf_token()
+        headers = {"Authorization": f"Bearer {token}"} if token else {}
+        return url, headers
+    if source == "modelscope":
+        # ModelScope raw-file endpoint takes the path via query string and
+        # 302-redirects to the underlying OSS object. httpx follows the
+        # redirect; OSS honors Range natively.
+        base = (endpoint or "https://www.modelscope.cn").rstrip("/")
+        # Encode the query values ourselves; "/" stays literal because it is
+        # legal inside a query and keeps common URLs unchanged.
+        query = urlencode(
+            {"FilePath": shard_filename, "Revision": revision},
+            quote_via=quote,
+            safe="/",
+        )
+        url = f"{base}/api/v1/models/{model_id}/repo?{query}"
+        token = get_modelscope_token()
+        headers = {"Authorization": f"Bearer {token}"} if token else {}
+        return url, headers
+    return None, {}
+def parse_header(content: bytes) -> dict[str, str] | None:
+    """Parse the safetensors binary header from a leading byte buffer.
+    Layout expected at the start of `content`: an 8-byte little-endian
+    uint64 giving the JSON length N, followed by N bytes of JSON mapping
+    tensor names to objects that carry a `dtype` string.
+    Pure function — safe to call on any bytes. Returns None on any malformed
+    input rather than raising: a buffer shorter than the prefix, a declared
+    length of zero or above `_MAX_HEADER_BYTES`, a truncated header, JSON
+    that is invalid, not an object, or nested too deeply to decode, or a
+    header with no tensor entry carrying a string dtype.
+    Returns:
+        `{tensor_name: dtype_string}` for every entry other than
+        `__metadata__` whose value is an object with a string `dtype`.
+    """
+    if len(content) < 8:
+        return None
+    # At least 8 bytes are present, so this unpack cannot fail.
+    (header_len,) = struct.unpack_from("<Q", content)
+    if header_len == 0 or header_len > _MAX_HEADER_BYTES:
+        return None
+    header_end = 8 + header_len
+    if len(content) < header_end:
+        return None
+    try:
+        header: Any = json.loads(content[8:header_end])
+    except (ValueError, RecursionError):
+        # ValueError covers JSONDecodeError and UnicodeDecodeError;
+        # RecursionError is what json raises for pathologically deep nesting.
+        return None
+    if not isinstance(header, dict):
+        return None
+    dtypes: dict[str, str] = {
+        name: info["dtype"]
+        for name, info in header.items()
+        if name != "__metadata__"
+        and isinstance(info, dict)
+        and isinstance(info.get("dtype"), str)
+    }
+    return dtypes or None