Spaces:
Running
Running
| """llm-cal Gradio web app — deploys to HuggingFace Spaces. | |
| User journey: | |
| 1. Type a HuggingFace model id (or pick from examples) | |
| 2. Choose target GPU | |
| 3. Hit Calculate | |
| 4. Read the same `--explain`-quality output the CLI gives you, but in a browser | |
| and shareable via URL parameters. | |
| The whole compute is the existing Python `Evaluator`. No new logic. | |
| Local run: | |
| python web/app.py | |
| HF Spaces: | |
| This file is the entry point Spaces expects. requirements.txt sits next to it. | |
| """ | |
| from __future__ import annotations | |
| import sys | |
| from pathlib import Path | |
# Ensure src/ is importable. Two layouts supported:
#   1. Local dev: /repo/web/app.py + /repo/src/  (parent.parent / src)
#   2. HF Space:  /space/app.py   + /space/src/  (parent / src)
# The deploy workflow flattens layout 1 → layout 2 when pushing to the Space.
_HERE = Path(__file__).resolve().parent
for _candidate in (_HERE / "src", _HERE.parent / "src"):
    if _candidate.exists():
        # Prepend so the bundled src/ wins over any installed copy; stop at
        # the first layout that exists.
        sys.path.insert(0, str(_candidate))
        break
| import os # noqa: E402 | |
| import gradio as gr # noqa: E402 | |
| from llm_cal.common.i18n import set_locale, t # noqa: E402 | |
| from llm_cal.core.evaluator import EvaluationReport, Evaluator # noqa: E402 | |
| from llm_cal.core.explain import ExplainEntry # noqa: E402 | |
| from llm_cal.core.explain import build as build_explain # noqa: E402 | |
| from llm_cal.hardware.loader import load_database # noqa: E402 | |
| from llm_cal.llm_review.reviewer import run_review # noqa: E402 | |
| from llm_cal.model_source.huggingface import HuggingFaceSource # noqa: E402 | |
| from llm_cal.model_source.modelscope import ModelScopeSource # noqa: E402 | |
# ---------------------------------------------------------------------------
# Static data the UI needs
# Hardware database loaded once at import time; treated as read-only by all
# request handlers below.
_DB = load_database()
| def _classify_vendor(gpu_id: str) -> tuple[str, str]: | |
| """Map a GPU id to (vendor_en, vendor_zh). | |
| Vendor isn't in the YAML schema (yet), so derive from the id prefix. | |
| """ | |
| gid = gpu_id.upper() | |
| if gid in {"B200", "GB200", "H100", "H800", "H200", "H20", "GH200"} or gid.startswith( | |
| ("L4", "L40", "RTX", "A10", "A100", "A40", "V100", "T4") | |
| ): | |
| return ("NVIDIA", "NVIDIA") | |
| if gid.startswith("MI"): | |
| return ("AMD", "AMD") | |
| if gid.startswith("GAUDI"): | |
| return ("Intel Habana", "英特尔 Habana") | |
| if gid.startswith("910") or gid.startswith("ATLAS"): | |
| return ("Huawei Ascend", "华为昇腾") | |
| if gid.startswith("MXC"): | |
| return ("MetaX 沐曦", "沐曦 MetaX") | |
| if gid.startswith("KUNLUN"): | |
| return ("Kunlunxin 昆仑芯", "昆仑芯 Kunlunxin") | |
| if gid.startswith("BR"): | |
| return ("Biren 壁仞", "壁仞 Biren") | |
| if gid.startswith("BI-"): | |
| return ("Iluvatar 天数智芯", "天数智芯 Iluvatar") | |
| if gid.startswith(("MR-", "MTT")): | |
| return ("Moore Threads 摩尔线程", "摩尔线程 Moore Threads") | |
| if gid.startswith("MLU"): | |
| return ("Cambricon 寒武纪", "寒武纪 Cambricon") | |
| if gid.startswith("HYGON"): | |
| return ("Hygon 海光", "海光 Hygon") | |
| return ("Other", "其他") | |
# Stable vendor display order.
# Must contain every vendor name _classify_vendor can return, so that
# _build_vendor_index pre-creates a bucket for each in this order.
_VENDOR_ORDER = [
    "NVIDIA",
    "AMD",
    "Intel Habana",
    "Huawei Ascend",
    "MetaX 沐曦",
    "Kunlunxin 昆仑芯",
    "Biren 壁仞",
    "Iluvatar 天数智芯",
    "Moore Threads 摩尔线程",
    "Cambricon 寒武纪",
    "Hygon 海光",
    "Other",
]
def _build_vendor_index() -> dict[str, list[str]]:
    """Return vendor_en -> sorted list of GPU ids found in the hardware DB.

    Buckets follow _VENDOR_ORDER; vendors with no GPUs in the DB are dropped.
    """
    buckets: dict[str, list[str]] = {vendor: [] for vendor in _VENDOR_ORDER}
    for gpu in _DB.gpus:
        vendor_en = _classify_vendor(gpu.id)[0]
        # setdefault is defensive: any vendor missing from _VENDOR_ORDER
        # still gets a bucket (appended after the ordered ones).
        buckets.setdefault(vendor_en, []).append(gpu.id)
    return {vendor: sorted(ids) for vendor, ids in buckets.items() if ids}
# Vendor → GPU-id index computed once at import; drives the two dropdowns.
_VENDOR_TO_GPUS = _build_vendor_index()
VENDOR_CHOICES_EN: list[str] = list(_VENDOR_TO_GPUS.keys())
DEFAULT_VENDOR = "NVIDIA"
DEFAULT_GPU = "H800"
# One-click example rows shown in the UI; each pre-fills the whole form.
EXAMPLE_MODELS: list[tuple[str, str, str, str, str]] = [
    # (model_id, vendor, gpu, engine, source)
    ("deepseek-ai/DeepSeek-V4-Flash", "NVIDIA", "H800", "vllm", "HuggingFace"),
    ("deepseek-ai/DeepSeek-V3", "NVIDIA", "H800", "vllm", "HuggingFace"),
    ("Qwen/Qwen2.5-72B-Instruct", "NVIDIA", "H100", "vllm", "HuggingFace"),
    ("Qwen/Qwen3-30B-A3B", "NVIDIA", "A100-80G", "vllm", "HuggingFace"),
    ("mistralai/Mixtral-8x7B-v0.1", "NVIDIA", "H100", "vllm", "HuggingFace"),
    ("microsoft/Phi-4", "NVIDIA", "RTX4090", "vllm", "HuggingFace"),
    ("deepseek-ai/DeepSeek-V4-Flash", "Huawei Ascend", "910B4", "vllm", "HuggingFace"),
    # ModelScope examples — same models, China-side mirror.
    ("Qwen/Qwen3-30B-A3B", "NVIDIA", "A100-80G", "vllm", "ModelScope"),
    ("deepseek-ai/DeepSeek-V3", "Huawei Ascend", "910B4", "vllm", "ModelScope"),
]
| # --------------------------------------------------------------------------- | |
| # Output rendering | |
| def _fmt_bytes(n: int | None) -> str: | |
| if n is None: | |
| return "—" | |
| if n < 1024: | |
| return f"{n} B" | |
| f = float(n) | |
| for u in ["KB", "MB", "GB", "TB"]: | |
| f /= 1024 | |
| if f < 1024: | |
| return f"{f:.2f} {u}" | |
| return f"{f:.2f} PB" | |
| def _fmt_params(n: int | None) -> str: | |
| if not n: | |
| return "—" | |
| if n >= 1_000_000_000: | |
| return f"{n / 1_000_000_000:.1f}B" | |
| if n >= 1_000_000: | |
| return f"{n / 1_000_000:.1f}M" | |
| return f"{n:,}" | |
| def _label_color(label: str) -> str: | |
| """Map a provenance label to a CSS color (visible in both light and dark).""" | |
| return { | |
| "verified": "#16a34a", # green-600 | |
| "inferred": "#2563eb", # blue-600 | |
| "estimated": "#d97706", # amber-600 | |
| "cited": "#7c3aed", # violet-600 | |
| "unverified": "#9a3412", # orange-800 | |
| "unknown": "#6b7280", # gray-500 | |
| "llm-opinion": "#db2777", # pink-600 | |
| }.get(label, "#6b7280") | |
def _label_chip(label_key: str) -> str:
    """Render a [label] chip tinted with the label's provenance color."""
    color = _label_color(label_key)
    text = t(f"label.{label_key}")
    # "1a" / "55" are hex alpha suffixes on the color: faint background,
    # semi-transparent border, full-strength text.
    style = f"background:{color}1a;color:{color};border:1px solid {color}55"
    return f'<span class="lc-chip" style="{style}">{text}</span>'
| def _stat_card(label: str, value: str, sublabel: str = "", chip: str = "") -> str: | |
| chip_html = f"<div class='lc-stat-chip'>{chip}</div>" if chip else "" | |
| sub_html = f"<div class='lc-stat-sub'>{sublabel}</div>" if sublabel else "" | |
| return ( | |
| f"<div class='lc-stat'>" | |
| f"<div class='lc-stat-value'>{value}</div>" | |
| f"<div class='lc-stat-label'>{label}</div>" | |
| f"{sub_html}{chip_html}" | |
| f"</div>" | |
| ) | |
| def _esc(s: str) -> str: | |
| return ( | |
| str(s) | |
| .replace("&", "&") | |
| .replace("<", "<") | |
| .replace(">", ">") | |
| ) | |
def _render(report: EvaluationReport, locale: str) -> str:
    """Render one single-GPU evaluation report as a self-contained HTML fragment.

    Sections are assembled independently and concatenated in a fixed order
    (headline cards, architecture, GPU spec, reconciliation, KV cache, fleet,
    performance, engine compatibility, generated command, star CTA). Every
    optional section collapses to "" when the report lacks its data, so the
    final concatenation never needs None checks.
    """
    set_locale(locale)  # type: ignore[arg-type]
    is_zh = locale == "zh"
    p, w, r, f = report.profile, report.weight, report.reconciliation, report.fleet
    # ---- Headline stat cards -------------------------------------------------
    weight_str = _fmt_bytes(w.total_bytes.value)
    weight_chip = _label_chip(w.total_bytes.label.value)
    quant_chip = _label_chip(w.quantization_guess.label.value)
    # The "prod" fleet tier drives the two right-hand headline cards ("—" when
    # no fleet or no prod option exists).
    prod_opt = (
        next((o for o in (f.options if f else []) if o.tier == "prod"), None) if f else None
    )
    prod_gpus = str(prod_opt.gpu_count) if prod_opt else "—"
    prod_concurrent = str(prod_opt.max_concurrent_at_reference_ctx) if prod_opt else "—"
    headline = (
        f"<div class='lc-header'>"
        f"<div class='lc-title'>{_esc(report.model_id)}</div>"
        f"<div class='lc-subtitle'>"
        f"{_esc(report.gpu)} · {_esc(report.engine)}"
        f"</div></div>"
        f"<div class='lc-stats'>"
        f"{_stat_card('Weight' if not is_zh else '权重', weight_str, sublabel='from safetensors API' if not is_zh else '取自 safetensors API', chip=weight_chip)}"
        f"{_stat_card('Quantization' if not is_zh else '量化', _esc(w.quantization_guess.value), sublabel='resolved scheme' if not is_zh else '已识别方案', chip=quant_chip)}"
        f"{_stat_card('Prod GPUs' if not is_zh else 'Prod GPU 数', prod_gpus, sublabel='for 16-user prod' if not is_zh else '生产档(16 路并发)')}"
        f"{_stat_card('Users @ 128K' if not is_zh else '用户 @ 128K', prod_concurrent, sublabel='concurrent at prod tier' if not is_zh else '生产档的并发')}"
        f"</div>"
    )
    # Provenance footer for the headline
    quant_source = _esc(w.quantization_guess.source or "")
    headline += f"<div class='lc-prov'>{quant_source}</div>"
    # ---- Architecture --------------------------------------------------------
    arch_rows: list[tuple[str, str]] = [("model_type", p.model_type)]
    if p.attention:
        arch_rows.append(
            (
                "attention",
                f"{p.attention.variant} (heads={p.attention.num_heads}, "
                f"kv_heads={p.attention.num_kv_heads}, hd={p.attention.head_dim})",
            )
        )
    if p.moe:
        arch_rows.append(
            (
                "moe",
                f"{p.moe.num_routed_experts} routed + "
                f"{p.moe.num_shared_experts} shared, top-{p.moe.num_experts_per_tok}",
            )
        )
    if p.sliding_window:
        arch_rows.append(("sliding_window", str(p.sliding_window)))
    arch_html = "".join(
        f"<tr><td><code>{_esc(k)}</code></td><td><code>{_esc(v)}</code></td></tr>"
        for k, v in arch_rows
    )
    arch_explainer = (
        "从模型 config.json 读出来的,决定后续所有公式怎么走(是否分组注意力、是否 MoE、是否滑动窗口)。"
        if is_zh
        else "Read straight from the model's config.json. Drives every formula "
        "downstream — attention sharding, MoE active-expert ratio, sliding window."
    )
    arch_section = (
        f"<div class='lc-section'><h3>{'架构' if is_zh else 'Architecture'}</h3>"
        f"<div class='lc-section-help'>{arch_explainer}</div>"
        f"<table class='lc-table'>{arch_html}</table></div>"
    )
    # ---- Reconciliation ------------------------------------------------------
    # Top-5 quantization candidates; the winning scheme row is highlighted.
    recon_rows = []
    for c in r.candidates[:5]:
        is_best = c.scheme == r.best.value
        cls = " class='lc-best'" if is_best else ""
        marker = " ✓" if is_best else ""
        recon_rows.append(
            f"<tr{cls}><td><code>{_esc(c.scheme)}</code>{marker}</td>"
            f"<td>{_fmt_bytes(c.predicted_bytes)}</td>"
            f"<td>{c.relative_error * 100:.1f}%</td></tr>"
        )
    recon_explainer = (
        "用每种量化方案预测应该有多少字节,跟实际 safetensors 字节对比。误差最小的胜出。"
        "FP4_FP8_MIXED / GPTQ_INT4 / AWQ_INT4 在 0.55 bpp 处会打平,需要 config 或 dtype 进一步区分。"
        if is_zh
        else "Predict bytes under each quantization hypothesis, compare against the real "
        "safetensors size. Lowest error wins. FP4_FP8_MIXED / GPTQ_INT4 / AWQ_INT4 tie "
        "at 0.55 bpp — broken via config.json or per-tensor dtype."
    )
    recon_section = (
        f"<div class='lc-section'>"
        f"<h3>{'量化反演' if is_zh else 'Quantization reconciliation'}</h3>"
        f"<div class='lc-section-help'>{recon_explainer}</div>"
        f"<table class='lc-table lc-table-recon'>"
        f"<thead><tr><th>Scheme</th>"
        f"<th>{'预测字节' if is_zh else 'Predicted'}</th>"
        f"<th>{'误差' if is_zh else 'Error'}</th></tr></thead>"
        f"<tbody>{''.join(recon_rows)}</tbody></table></div>"
    )
    # ---- Fleet ---------------------------------------------------------------
    fleet_section = ""
    if f and f.options:
        # Pick which context lengths get their own concurrency column.
        # Always include 128K if any option has it; also include the model max
        # if it's larger (e.g. 1M for DeepSeek-V4-Flash) so the user can compare
        # "fits 23 users at 128K but only 2 at 1M".
        all_ctxs: set[int] = set()
        for opt in f.options:
            for ctx, _ in opt.max_concurrent_by_context:
                all_ctxs.add(ctx)
        ctx_cols: list[int] = []
        if 131_072 in all_ctxs:
            ctx_cols.append(131_072)
        max_ctx = max(all_ctxs) if all_ctxs else 0
        if max_ctx > 131_072 and max_ctx not in ctx_cols:
            ctx_cols.append(max_ctx)
        # Fallback: no 128K and no larger max — show whatever max exists.
        if not ctx_cols and all_ctxs:
            ctx_cols.append(max_ctx)

        def _ctx_label(ctx: int) -> str:
            # Decimal-million for "M", binary-K for "K" (matches fleet data).
            if ctx >= 1_000_000:
                return f"{ctx // 1_000_000}M" if ctx % 1_000_000 == 0 else f"{ctx / 1_000_000:.1f}M"
            if ctx >= 1024:
                return f"{ctx // 1024}K"
            return str(ctx)

        rows = []
        for opt in f.options:
            star = " ★" if opt.tier == f.best_tier else ""
            cls = " class='lc-best'" if opt.tier == f.best_tier else ""
            # Headroom = usable VRAM left for KV cache after weights; clamp at
            # zero so an over-committed tier doesn't show negative bytes.
            headroom = max(0, opt.usable_bytes_per_gpu - opt.weight_bytes_per_gpu)
            ctx_map = dict(opt.max_concurrent_by_context)
            ctx_cells = "".join(f"<td>{ctx_map.get(c, '—')}</td>" for c in ctx_cols)
            rows.append(
                f"<tr{cls}><td><code>{opt.tier}{star}</code></td>"
                f"<td>{opt.gpu_count}</td>"
                f"<td>{_fmt_bytes(opt.weight_bytes_per_gpu)}</td>"
                f"<td>{_fmt_bytes(headroom)}</td>"
                f"{ctx_cells}</tr>"
            )
        ctx_headers = "".join(
            f"<th>{('@ ' + _ctx_label(c) + ' 并发') if is_zh else ('Concurrent @ ' + _ctx_label(c))}</th>"
            for c in ctx_cols
        )
        fleet_explainer = (
            "min = 刚好放得下;dev = 8 路并发场景;prod = 16 路并发场景。★ = 推荐。"
            if is_zh
            else "min = barely fits weights; dev = sized for 8 concurrent at 128K; "
            "prod = sized for 16 concurrent at 128K. ★ = recommended."
        )
        fleet_section = (
            f"<div class='lc-section'>"
            f"<h3>{'推荐集群' if is_zh else 'Recommended fleet'}</h3>"
            f"<div class='lc-section-help'>{fleet_explainer}</div>"
            f"<table class='lc-table'>"
            f"<thead><tr><th>Tier</th><th>GPUs</th>"
            f"<th>Weight/GPU</th><th>Headroom/GPU</th>"
            f"{ctx_headers}</tr></thead>"
            f"<tbody>{''.join(rows)}</tbody></table></div>"
        )
    # ---- Performance ---------------------------------------------------------
    perf_explainer = (
        "Prefill 用算力公式(FLOPs = 2 × 参数 × 输入 token),decode 用带宽公式(吞吐 = 带宽 × 利用率 / 权重字节)。"
        "Bottleneck 标 memory_bandwidth 说明 decode 是带宽瓶颈,加显存带宽更高的 GPU 比加算力更划算。"
        if is_zh
        else "Prefill uses the compute formula (FLOPs = 2 × params × input_tokens, Kaplan 2020). "
        "Decode uses memory-bandwidth formula (tps = BW × util / weight_bytes, vLLM paper). "
        "Bottleneck = memory_bandwidth means a higher-BW GPU helps more than more FLOPS."
    )
    perf_section = ""
    if report.prefill and report.decode and report.concurrency:
        max_users = report.concurrency.max_concurrent.value
        bn = report.concurrency.bottleneck
        # (label, value_html, sub-caption); value_html is NOT escaped below so
        # the Bottleneck entry can carry a <code> wrapper — escape any raw
        # value here before adding it.
        items = [
            (
                "Prefill latency" if not is_zh else "Prefill 延迟",
                f"{report.prefill.latency_ms.value:.0f} ms",
                f"@ {report.perf_input_tokens or 2000} input tokens",
            ),
            (
                "Cluster decode" if not is_zh else "集群 decode 吞吐",
                f"{report.decode.cluster_tokens_per_sec.value:.0f} tok/s",
                "",
            ),
            (
                "Max concurrent users" if not is_zh else "最大并发用户",
                str(max_users),
                "",
            ),
            (
                "Bottleneck" if not is_zh else "瓶颈",
                f"<code>{_esc(bn)}</code>",
                "",
            ),
        ]
        items_html = "".join(
            f"<div class='lc-perf-item'>"
            f"<div class='lc-perf-value'>{v}</div>"
            f"<div class='lc-perf-label'>{_esc(label)}</div>"
            f"<div class='lc-perf-sub'>{_esc(sub)}</div></div>"
            for label, v, sub in items
        )
        perf_section = (
            f"<div class='lc-section'>"
            f"<h3>{'性能' if is_zh else 'Performance'}</h3>"
            f"<div class='lc-section-help'>{perf_explainer}</div>"
            f"<div class='lc-perf'>{items_html}</div></div>"
        )
    # ---- KV cache per request -----------------------------------------------
    kv_section = ""
    if report.kv_cache_by_context:
        rows = []
        for ctx, av in sorted(report.kv_cache_by_context.items()):
            rows.append(
                f"<tr><td>{ctx:,}</td><td>{_fmt_bytes(av.value)}</td>"
                f"<td>{_label_chip(av.label.value)}</td></tr>"
            )
        kv_explainer = (
            "单个请求在不同 context 长度下需要多少 KV 缓存。这是决定一张 GPU 能并发跑多少请求的关键。"
            "MLA / MQA 模型这里会比标准 GQA 小很多。"
            if is_zh
            else "How much KV cache one request consumes at each context length. "
            "This is what limits per-GPU concurrency. MLA / MQA models are "
            "dramatically smaller here than standard GQA."
        )
        kv_section = (
            f"<div class='lc-section'>"
            f"<h3>{'KV 缓存(每请求)' if is_zh else 'KV cache per request'}</h3>"
            f"<div class='lc-section-help'>{kv_explainer}</div>"
            f"<table class='lc-table lc-table-recon'>"
            f"<thead><tr><th>{'Context tokens' if not is_zh else 'Context 长度'}</th>"
            f"<th>{'KV bytes' if not is_zh else 'KV 字节'}</th>"
            f"<th>{'Label' if not is_zh else '标签'}</th></tr></thead>"
            f"<tbody>{''.join(rows)}</tbody></table></div>"
        )
    # ---- Engine compatibility -----------------------------------------------
    engine_section = ""
    em = report.engine_match
    if em:
        # NOTE: both this helper's parameter and the genexps below reuse the
        # name `f`, shadowing the fleet binding from the prologue. Harmless
        # (fleet is no longer needed here) but easy to misread.
        def _fmt_flag(f) -> str:  # noqa: ANN001
            base = f"{f.flag} {f.value}".strip()
            return base

        flags = ", ".join(_fmt_flag(f) for f in em.required_flags) if em.required_flags else "—"
        opt_flags = ", ".join(_fmt_flag(f) for f in em.optional_flags) if em.optional_flags else "—"
        caveats = em.caveats_zh if is_zh else em.caveats_en
        sources_html = "—"
        if em.sources:
            sources_html = "<br>".join(
                f'<a href="{_esc(s.url)}" target="_blank" rel="noopener">{_esc(s.url)}</a>'
                + (
                    f" <span class='lc-prov'>({_esc(s.captured_date)})</span>"
                    if s.captured_date
                    else ""
                )
                for s in em.sources
            )
        rows = [
            (("引擎" if is_zh else "Engine"), f"<code>{_esc(em.engine)}</code>"),
            (
                ("版本要求" if is_zh else "Version"),
                f"<code>{_esc(em.version_spec)}</code>",
            ),
            (
                ("支持级别" if is_zh else "Support"),
                # Known provenance labels get a colored chip; anything else is
                # shown verbatim in <code>.
                _label_chip(em.support) if em.support in {"verified", "cited", "unverified"} else f"<code>{_esc(em.support)}</code>",
            ),
            (
                ("验证级别" if is_zh else "Verification"),
                _label_chip(em.verification_level),
            ),
            (("必需 flag" if is_zh else "Required flags"), f"<code>{_esc(flags)}</code>"),
            (("可选 flag" if is_zh else "Optional flags"), f"<code>{_esc(opt_flags)}</code>"),
        ]
        if caveats:
            rows.append((("注意事项" if is_zh else "Caveats"), _esc(caveats)))
        rows.append((("来源" if is_zh else "Sources"), sources_html))
        body = "".join(f"<tr><td>{k}</td><td>{v}</td></tr>" for k, v in rows)
        engine_explainer = (
            "这个模型在 vLLM/SGLang 哪个版本起能跑、需要哪些必需 flag、有哪些优化 flag。"
            "verification_level 标 cited 表示从 PR / release note 引用,verified 表示实测过。"
            if is_zh
            else "Which engine version supports this model, what flags are required, "
            "and which optional flags help. verification_level=cited means we got it "
            "from a PR or release note; verified means we actually ran it."
        )
        engine_section = (
            f"<div class='lc-section'>"
            f"<h3>{'引擎兼容性' if is_zh else 'Engine compatibility'}</h3>"
            f"<div class='lc-section-help'>{engine_explainer}</div>"
            f"<table class='lc-table'>{body}</table></div>"
        )
    # ---- GPU spec ------------------------------------------------------------
    gpu_section = ""
    g = report.gpu_spec
    if g:
        notes = g.notes_zh if is_zh else g.notes_en
        rows = [
            ("HBM", f"{g.memory_gb} GB"),
            ("Memory BW", f"{g.memory_bandwidth_gbps or '—'} GB/s"),
            ("NVLink BW", f"{g.nvlink_bandwidth_gbps} GB/s"),
            ("FP16 TFLOPS", f"{g.fp16_tflops}"),
            ("FP8", "✓" if g.fp8_support else "—"),
            ("FP4", "✓" if g.fp4_support else "—"),
        ]
        rows_html = "".join(
            f"<tr><td>{_esc(k)}</td><td><code>{_esc(v)}</code></td></tr>"
            for k, v in rows
        )
        notes_html = (
            f"<div class='lc-prov' style='margin-top:8px'>{_esc(notes)}</div>" if notes else ""
        )
        # Render spec_source as a link only when it looks like a URL;
        # otherwise print it as plain provenance text.
        source_html = (
            f"<div class='lc-prov'>{'来源' if is_zh else 'Source'}: "
            f"<a href='{_esc(g.spec_source)}' target='_blank' rel='noopener'>"
            f"{_esc(g.spec_source)}</a></div>"
            if g.spec_source and g.spec_source.startswith("http")
            else (f"<div class='lc-prov'>{_esc(g.spec_source)}</div>" if g.spec_source else "")
        )
        gpu_explainer = (
            "目标 GPU 的硬件规格。Memory BW 决定 decode 能跑多快,FP8/FP4 支持决定能用什么量化。"
            if is_zh
            else "Hardware spec of the chosen GPU. Memory BW caps decode throughput; "
            "FP8/FP4 support determines which quantization paths actually accelerate."
        )
        gpu_section = (
            f"<div class='lc-section'>"
            f"<h3>{'目标 GPU 规格' if is_zh else 'Target GPU spec'} — <code>{_esc(g.id)}</code></h3>"
            f"<div class='lc-section-help'>{gpu_explainer}</div>"
            f"<table class='lc-table'>{rows_html}</table>"
            f"{notes_html}{source_html}"
            f"</div>"
        )
    # ---- Generated command ---------------------------------------------------
    cmd_section = ""
    if report.generated_command:
        cmd_explainer = (
            "可以直接复制粘贴到带显卡的机器上跑。flag 是按推荐 tier 的 GPU 数 + 引擎兼容矩阵的必需 flag 自动拼的。"
            if is_zh
            else "Copy-pasteable on a machine with the right GPUs. Flags auto-assembled "
            "from the recommended fleet tier + engine compat matrix's required flags."
        )
        cmd_section = (
            f"<div class='lc-section'>"
            f"<h3>{'生成命令' if is_zh else 'Generated command'}</h3>"
            f"<div class='lc-section-help'>{cmd_explainer}</div>"
            f"<pre class='lc-cmd'><code>{_esc(report.generated_command)}</code></pre></div>"
        )
    return (
        "<div class='lc-result'>"
        + headline
        + arch_section
        + gpu_section
        + recon_section
        + kv_section
        + fleet_section
        + perf_section
        + engine_section
        + cmd_section
        + _render_star_cta(is_zh)
        + "</div>"
    )
def _render_compare(reports: list[EvaluationReport], locale: str) -> str:
    """Side-by-side comparison of N >= 2 reports for the same model on
    different GPUs.

    Each metric column declares whether higher or lower is better and we
    paint the winner cell in green so the eye snaps to it.
    """
    set_locale(locale)  # type: ignore[arg-type]
    is_zh = locale == "zh"
    # All reports share the same model_id + engine — pull from the first.
    head = reports[0]
    title = (
        f"<div class='lc-header'>"
        f"<div class='lc-title'>{_esc(head.model_id)}</div>"
        f"<div class='lc-subtitle'>"
        f"{('对比 ' + str(len(reports)) + ' 张 GPU') if is_zh else ('Comparing ' + str(len(reports)) + ' GPUs')}"
        f" · {_esc(head.engine)}"
        f"</div></div>"
    )

    # Metric definitions: (label_en, label_zh, value_fn, better=lower|higher|info, formatter)
    # "info" rows are not contested — used for model-determined facts (same across
    # GPUs by construction) or for descriptive cells like Bottleneck.
    def _max_concurrent(r: EvaluationReport) -> int | None:
        # Concurrency at the "prod" fleet tier; None when no fleet/prod option.
        if not r.fleet:
            return None
        prod = next((o for o in r.fleet.options if o.tier == "prod"), None)
        return prod.max_concurrent_at_reference_ctx if prod else None

    def _prod_gpu_count(r: EvaluationReport) -> int | None:
        # GPU count at the "prod" fleet tier; None when no fleet/prod option.
        if not r.fleet:
            return None
        prod = next((o for o in r.fleet.options if o.tier == "prod"), None)
        return prod.gpu_count if prod else None

    def _kv_per_user_128k(r: EvaluationReport) -> int | None:
        # Per-request KV bytes at the 128K (131072-token) reference context.
        av = r.kv_cache_by_context.get(131072)
        return av.value if av is not None else None

    def _native_precision_score(r: EvaluationReport) -> int | None:
        # 0 = FP16 only, 1 = FP8, 2 = FP8 + FP4 (see _fmt_native).
        g = r.gpu_spec
        if g is None:
            return None
        return (1 if g.fp8_support else 0) + (1 if g.fp4_support else 0)

    def _fmt_native(v: int | None) -> str:
        if v is None:
            return "—"
        return {0: "FP16 only", 1: "FP8", 2: "FP8 + FP4"}.get(v, str(v))

    def _max_context_tokens(r: EvaluationReport) -> int | None:
        """Effective max context the model claims to support.

        In modern HF configs (LLaMA 3+, DeepSeek V3+, Qwen2.5+), the field
        max_position_embeddings already reflects the post-RoPE/YaRN-scaling
        window. rope_scaling_factor is recorded for provenance but must NOT
        be multiplied in again — that double-counts.
        """
        pos = r.profile.position
        if pos is None or pos.max_position_embeddings is None:
            return None
        return int(pos.max_position_embeddings)

    def _fmt_context(v: int | None) -> str:
        """Binary-base formatting so 131072 reads as '128K' not '131K'."""
        if v is None:
            return "—"
        if v >= 1024 * 1024:
            return f"{v / (1024 * 1024):.1f}M".replace(".0M", "M")
        if v >= 1024:
            return f"{v // 1024}K"
        return str(v)

    def _cluster_qps(r: EvaluationReport) -> float | None:
        """Steady-state queries/sec the cluster sustains:
        QPS = cluster_decode_tokens_per_sec / output_tokens_per_request."""
        if not r.decode or r.decode.cluster_tokens_per_sec.value <= 0:
            return None
        out = r.perf_output_tokens or 512
        if out <= 0:
            return None
        return r.decode.cluster_tokens_per_sec.value / out

    metrics = [
        # ── Model-determined rows (info; identical across GPUs by definition) ──
        ("Quantization", "量化方案",
         lambda r: r.weight.quantization_guess.value, "info",
         lambda v: _esc(str(v)) if v else "—"),
        ("Weights total", "权重总量",
         lambda r: r.weight.total_bytes.value, "info",
         lambda v: _fmt_bytes(v) if v else "—"),
        ("KV / user @ 128K", "KV / 用户 @ 128K",
         _kv_per_user_128k, "info",
         lambda v: _fmt_bytes(v) if v is not None else "—"),
        ("Max context", "最大上下文",
         _max_context_tokens, "info",
         _fmt_context),
        # ── GPU hardware specs (contested) ──
        ("HBM / card", "单卡显存",
         lambda r: r.gpu_spec.memory_gb if r.gpu_spec else None, "higher",
         lambda v: f"{v} GB" if v is not None else "—"),
        ("HBM bandwidth", "显存带宽",
         lambda r: r.gpu_spec.memory_bandwidth_gbps if r.gpu_spec else None, "higher",
         lambda v: f"{v:,} GB/s" if v is not None else "—"),
        ("NVLink / card", "NVLink 带宽",
         lambda r: r.gpu_spec.nvlink_bandwidth_gbps if r.gpu_spec else None, "higher",
         lambda v: (f"{v} GB/s" if v else "无") if v is not None else "—"),
        ("Native FP8/FP4", "原生低精度",
         _native_precision_score, "higher",
         _fmt_native),
        # ── Sizing & performance outcomes (contested) ──
        ("Prod GPUs", "生产档 GPU 数",
         _prod_gpu_count, "lower",
         lambda v: str(v) if v is not None else "—"),
        ("Users @ 128K", "用户 @ 128K",
         _max_concurrent, "higher",
         lambda v: str(v) if v is not None else "—"),
        ("Prefill latency", "Prefill 延迟",
         lambda r: r.prefill.latency_ms.value if r.prefill else None, "lower",
         lambda v: f"{v:.0f} ms" if v is not None else "—"),
        ("Per-GPU decode", "单卡 decode 吞吐",
         lambda r: r.decode.per_gpu_tokens_per_sec.value if r.decode else None, "higher",
         lambda v: f"{v:.0f} tok/s" if v is not None else "—"),
        ("Cluster decode", "集群 decode 吞吐",
         lambda r: r.decode.cluster_tokens_per_sec.value if r.decode else None, "higher",
         lambda v: f"{v:.0f} tok/s" if v is not None else "—"),
        ("Sustained QPS", "稳态 QPS",
         _cluster_qps, "higher",
         lambda v: f"{v:.2f} q/s" if v is not None else "—"),
        # ── Diagnostic (info — string, not a number race) ──
        ("Bottleneck", "瓶颈",
         lambda r: r.concurrency.bottleneck if r.concurrency else None, "info",
         lambda v: f"<code>{_esc(str(v))}</code>" if v else "—"),
    ]
    # GPU column headers
    gpu_headers = "".join(
        f"<th class='lc-cmp-gpu'>{_esc(r.gpu)}</th>" for r in reports
    )
    rows_html = []
    for label_en, label_zh, getter, better, fmt in metrics:
        values = [getter(r) for r in reports]
        # Pick the winning index. None values are excluded from the contest.
        winner_idx: int | None = None
        if better in ("higher", "lower"):
            numeric_pairs = [(i, v) for i, v in enumerate(values) if isinstance(v, (int, float))]
            if numeric_pairs:
                if better == "higher":
                    winner_idx = max(numeric_pairs, key=lambda p: p[1])[0]
                else:
                    winner_idx = min(numeric_pairs, key=lambda p: p[1])[0]
                # If all values are equal, no winner (avoid arbitrary-tiebreak gold star)
                vals_set = {v for _, v in numeric_pairs}
                if len(vals_set) <= 1:
                    winner_idx = None
        cells = []
        for i, v in enumerate(values):
            cls = " class='lc-cmp-winner'" if i == winner_idx else ""
            cells.append(f"<td{cls}>{fmt(v)}</td>")
        label = label_zh if is_zh else label_en
        # Tag info rows so the eye knows "this is a model fact, not a contest".
        is_info = better == "info"
        label_cls = "lc-cmp-row-label lc-cmp-row-info" if is_info else "lc-cmp-row-label"
        tr_cls = " class='lc-cmp-tr-info'" if is_info else ""
        rows_html.append(
            f"<tr{tr_cls}><th class='{label_cls}'>{_esc(label)}</th>{''.join(cells)}</tr>"
        )
    # Aggregate winner — count column wins across "higher/lower" metrics
    # (same winner-selection rules as the per-row pass above).
    win_counts = [0] * len(reports)
    for label_en, label_zh, getter, better, fmt in metrics:
        if better == "info":
            continue
        values = [getter(r) for r in reports]
        numeric_pairs = [(i, v) for i, v in enumerate(values) if isinstance(v, (int, float))]
        if not numeric_pairs:
            continue
        vals_set = {v for _, v in numeric_pairs}
        if len(vals_set) <= 1:
            continue
        if better == "higher":
            winner_idx = max(numeric_pairs, key=lambda p: p[1])[0]
        else:
            winner_idx = min(numeric_pairs, key=lambda p: p[1])[0]
        win_counts[winner_idx] += 1
    overall_text = ""
    if any(win_counts):
        max_wins = max(win_counts)
        leaders = [reports[i].gpu for i, c in enumerate(win_counts) if c == max_wins]
        if len(leaders) == 1:
            overall_text = (
                f"<div class='lc-cmp-summary'>"
                f"{'综合最优' if is_zh else 'Overall winner'}: "
                f"<strong>{_esc(leaders[0])}</strong> "
                f"({max_wins}/{sum(1 for m in metrics if m[3] != 'info')} "
                f"{'指标领先' if is_zh else 'metrics lead'})"
                f"</div>"
            )
        else:
            overall_text = (
                f"<div class='lc-cmp-summary'>"
                f"{'势均力敌' if is_zh else 'Tied'}: "
                f"<strong>{_esc(' / '.join(leaders))}</strong>"
                f"</div>"
            )
    table = (
        f"<div class='lc-section'>"
        f"<h3>{'对比' if is_zh else 'Side-by-side comparison'}</h3>"
        f"<div class='lc-cmp-wrap'>"
        f"<table class='lc-cmp-table'>"
        f"<thead><tr>"
        f"<th class='lc-cmp-row-label'></th>"
        f"{gpu_headers}"
        f"</tr></thead>"
        f"<tbody>{''.join(rows_html)}</tbody>"
        f"</table></div>"
        f"{overall_text}"
        f"</div>"
    )
    # Per-GPU detail headlines (small stat cards) below the table
    detail_blocks = []
    for r in reports:
        weight_str = _fmt_bytes(r.weight.total_bytes.value)
        prod = _prod_gpu_count(r)
        users = _max_concurrent(r)
        detail_blocks.append(
            f"<div class='lc-cmp-detail'>"
            f"<div class='lc-cmp-detail-gpu'>{_esc(r.gpu)}</div>"
            f"<div class='lc-cmp-detail-row'>"
            f"<span>{'权重' if is_zh else 'Weight'}</span><strong>{weight_str}</strong></div>"
            f"<div class='lc-cmp-detail-row'>"
            f"<span>{'生产 GPU' if is_zh else 'Prod GPUs'}</span>"
            f"<strong>{prod if prod is not None else '—'}</strong></div>"
            f"<div class='lc-cmp-detail-row'>"
            f"<span>{'用户 @ 128K' if is_zh else 'Users @ 128K'}</span>"
            f"<strong>{users if users is not None else '—'}</strong></div>"
            f"</div>"
        )
    detail_section = (
        f"<div class='lc-section'>"
        f"<h3>{'各档详情' if is_zh else 'Per-GPU detail'}</h3>"
        f"<div class='lc-cmp-details'>{''.join(detail_blocks)}</div>"
        f"</div>"
    )
    return (
        "<div class='lc-result'>"
        + title
        + table
        + detail_section
        + _render_star_cta(is_zh)
        + "</div>"
    )
| def _render_star_cta(is_zh: bool) -> str: | |
| """Tail-of-result CTA — shown right after the user got their answer, | |
| which is when satisfaction is highest and the GitHub star ask reads as | |
| 'thanks for the tool' rather than 'please give me attention'.""" | |
| en_msg = "Saved you GPU-sizing math?" | |
| zh_msg = "省了你 GPU 选型的时间?" | |
| cta_en = "Star on GitHub" | |
| cta_zh = "给个 Star" | |
| text_top = zh_msg if is_zh else en_msg | |
| text_bottom = en_msg if is_zh else zh_msg | |
| cta = f"{cta_zh if is_zh else cta_en} · {cta_en if is_zh else cta_zh}" | |
| return ( | |
| "<a class='lc-star-cta' href='https://github.com/FlyTOmeLight/llm-cal' " | |
| "target='_blank' rel='noopener'>" | |
| "<svg viewBox='0 0 16 16' width='18' height='18' aria-hidden='true' fill='currentColor'>" | |
| "<path d='M8 0C3.58 0 0 3.58 0 8a8 8 0 0 0 5.47 7.59c.4.07.55-.17.55-.38v-1.33c-2.22.48-2.69-1.07-2.69-1.07-.36-.92-.89-1.17-.89-1.17-.73-.5.06-.49.06-.49.81.06 1.23.83 1.23.83.72 1.23 1.88.87 2.34.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.83-2.15-.08-.2-.36-1.02.08-2.13 0 0 .67-.21 2.2.82a7.6 7.6 0 0 1 4 0c1.53-1.04 2.2-.82 2.2-.82.44 1.11.16 1.93.08 2.13.51.56.83 1.27.83 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48v2.19c0 .21.15.46.55.38A8 8 0 0 0 16 8c0-4.42-3.58-8-8-8z'/>" | |
| "</svg>" | |
| f"<div class='lc-star-cta-text'>" | |
| f"<div class='lc-star-cta-q'>{text_top}</div>" | |
| f"<div class='lc-star-cta-q-en'>{text_bottom}</div>" | |
| f"</div>" | |
| f"<div class='lc-star-cta-action'>{cta} →</div>" | |
| "</a>" | |
| ) | |
def _render_explain(entries: list[ExplainEntry], is_zh: bool) -> str:
    """Render the --explain derivation trace as a stack of HTML entry cards."""
    if not entries:
        return ""
    result_word = "结果" if is_zh else "Result"
    source_word = "来源" if is_zh else "Source"
    section_title = "推导链 (--explain)" if is_zh else "Derivation trace (--explain)"
    rendered = []
    for entry in entries:
        pieces = [
            "<div class='lc-explain-entry'>",
            f"<div class='lc-explain-heading'>{_esc(entry.heading)}</div>",
            f"<div class='lc-explain-formula'><code>{_esc(entry.formula)}</code></div>",
        ]
        if entry.inputs:
            items = []
            for inp in entry.inputs:
                li = (
                    f"<li><code>{_esc(inp.name)}</code> = "
                    f"<strong>{_esc(inp.value)}</strong> "
                    f"<span class='lc-explain-label'>{_esc(inp.label)}</span>"
                )
                if inp.note:
                    li += f" — <em>{_esc(inp.note)}</em>"
                items.append(li + "</li>")
            pieces.append("<ul class='lc-explain-inputs'>" + "".join(items) + "</ul>")
        if entry.steps:
            pieces.append(
                "<ol class='lc-explain-steps'>"
                + "".join(f"<li>{_esc(step)}</li>" for step in entry.steps)
                + "</ol>"
            )
        pieces.append(
            f"<div class='lc-explain-result'>"
            f"{result_word}: <strong>{_esc(entry.result)}</strong></div>"
        )
        if entry.source:
            pieces.append(
                f"<div class='lc-prov'>{source_word}: {_esc(entry.source)}</div>"
            )
        pieces.append("</div>")
        rendered.append("".join(pieces))
    return (
        "<div class='lc-result'>"
        f"<div class='lc-section'>"
        f"<h3>{section_title}</h3>"
        + "".join(rendered)
        + "</div></div>"
    )
def _render_llm_review(content: str | None, error: str | None, model: str, is_zh: bool) -> str:
    """Render the optional LLM second-opinion panel (or its error banner)."""
    if error:
        return _render_error(f"LLM review: {error}", is_zh)
    if not content:
        return ""
    # The reviewer answers in markdown. gr.HTML does not render markdown, but
    # escaped text with <br> line breaks still reads fine (## headers stay
    # visible as plain text).
    body = _esc(content).replace("\n", "<br>")
    heading = "LLM 审计 (--llm-review)" if is_zh else "LLM review (--llm-review)"
    banner_text = (
        "仅供参考,不覆盖前 6 个 label"
        if is_zh
        else "Second opinion — never overrides the 6 primary labels"
    )
    return (
        "<div class='lc-result'>"
        f"<div class='lc-section'>"
        f"<h3>{heading} "
        f"<span class='lc-llm-model'>{_esc(model)}</span></h3>"
        f"<div class='lc-llm-banner'>"
        f"{_label_chip('llm-opinion')} "
        f"{banner_text}"
        f"</div>"
        f"<div class='lc-llm-content'>{body}</div>"
        f"</div></div>"
    )
def _render_error(msg: str, is_zh: bool) -> str:
    """Wrap *msg* in the standard red error panel with a localized heading."""
    heading = "出错了" if is_zh else "Error"
    parts = [
        "<div class='lc-result lc-error'>",
        f"<h3>{heading}</h3>",
        f"<pre>{_esc(msg)}</pre></div>",
    ]
    return "".join(parts)
| def _render_loading(is_zh: bool) -> str: | |
| msg = ( | |
| "正在拉取模型元数据 + 读 safetensors header… 首次大模型约 3-8 秒" | |
| if is_zh | |
| else "Fetching model metadata + reading safetensors header… " | |
| "first lookup of a large model takes 3-8 seconds" | |
| ) | |
| return ( | |
| "<div class='lc-result lc-loading'>" | |
| "<div class='lc-spinner'></div>" | |
| f"<div class='lc-loading-text'>{msg}</div>" | |
| "</div>" | |
| ) | |
| # --------------------------------------------------------------------------- | |
| # Backend handler | |
| _evaluators: dict[str, Evaluator] = {} | |
| def _get_evaluator(source_key: str) -> Evaluator: | |
| """One evaluator per source — Evaluator caches an HfApi client internally | |
| so we don't want to rebuild it every keystroke.""" | |
| if source_key not in _evaluators: | |
| if source_key == "modelscope": | |
| _evaluators[source_key] = Evaluator(source=ModelScopeSource()) | |
| else: | |
| _evaluators[source_key] = Evaluator(source=HuggingFaceSource()) | |
| return _evaluators[source_key] | |
def calculate(
    model_id: str,
    gpu,  # list[str] from multiselect; str also tolerated  # noqa: ANN001
    engine: str,
    context_length: int | None,
    lang: str,
    source: str,
    gpu_count: int | None,
    input_tokens: int,
    output_tokens: int,
    target_tps: float,
    prefill_util: float,
    decode_bw_util: float,
    concurrency_degradation: float,
    refresh: bool,
    explain: bool,
    llm_review: bool,
    hf_token: str,
    ms_token: str,
    llm_api_key: str,
    llm_base_url: str,
    llm_model: str,
) -> tuple[str, str, str]:
    """Main Gradio submit handler.

    Validates inputs, runs the Evaluator for one or more GPUs, and renders
    HTML. Returns ``(main_html, explain_html, llm_review_html)``; the last
    two are empty strings unless the matching checkbox is ticked, and are
    always empty on the multi-GPU compare path.

    Fix vs. previous revision: the three LLM reviewer fields are now guarded
    with ``x and x.strip()`` like the HF/MS tokens — a ``None`` component
    value no longer raises AttributeError. ``lang``/``source`` get the same
    defensive treatment.
    """
    # Defensive None-guards: Gradio may hand callbacks None for unset values.
    locale = "zh" if lang and lang.startswith("中") else "en"
    is_zh = locale == "zh"
    # Normalize GPU input. Multiselect returns list; defensive coerce for safety.
    if isinstance(gpu, str):
        gpu_list = [gpu] if gpu else []
    elif isinstance(gpu, (list, tuple)):
        gpu_list = [g for g in gpu if g]
    else:
        gpu_list = []
    if not model_id or not model_id.strip():
        return (
            _render_error(
                "请输入模型 ID" if is_zh else "Enter a model id",
                is_zh,
            ),
            "",
            "",
        )
    if not gpu_list:
        return (_render_error("请选择 GPU" if is_zh else "Pick a GPU", is_zh), "", "")
    is_compare = len(gpu_list) >= 2
    # Resolve source key. The radio shows e.g. "HuggingFace" / "ModelScope".
    src_key = "modelscope" if source and "modelscope" in source.lower() else "huggingface"
    # Inject user-provided tokens into env for the duration of this call only.
    # We restore the prior values in the finally block so a token entered for
    # one model doesn't leak into the next request from a different user.
    token_env_keys = (
        "HF_TOKEN",
        "HUGGING_FACE_HUB_TOKEN",
        "MODELSCOPE_API_TOKEN",
        "MODELSCOPE_TOKEN",
    )
    old_token_env = {k: os.environ.get(k) for k in token_env_keys}
    if hf_token and hf_token.strip():
        os.environ["HF_TOKEN"] = hf_token.strip()
    if ms_token and ms_token.strip():
        os.environ["MODELSCOPE_API_TOKEN"] = ms_token.strip()

    def _eval_one(g: str) -> EvaluationReport:
        # Falsy/non-positive numeric inputs fall back to the CLI defaults.
        return _get_evaluator(src_key).evaluate(
            model_id=model_id.strip(),
            gpu=g,
            engine=engine,
            gpu_count=gpu_count if gpu_count and gpu_count > 0 else None,
            context_length=context_length if context_length and context_length > 0 else None,
            refresh=refresh,
            input_tokens=int(input_tokens) if input_tokens else 2000,
            output_tokens=int(output_tokens) if output_tokens else 512,
            target_tokens_per_sec=float(target_tps) if target_tps else 30.0,
            prefill_utilization=float(prefill_util) if prefill_util else 0.40,
            decode_bw_utilization=float(decode_bw_util) if decode_bw_util else 0.50,
            concurrency_degradation=(
                float(concurrency_degradation) if concurrency_degradation else 1.0
            ),
        )

    try:
        # ---- Compare path: 2-4 GPUs --------------------------------------
        if is_compare:
            try:
                reports = [_eval_one(g) for g in gpu_list]
            except Exception as e:  # noqa: BLE001
                return (_render_error(f"{type(e).__name__}: {e}", is_zh), "", "")
            return _render_compare(reports, locale), "", ""
        # ---- Single-GPU path (existing flow) ------------------------------
        try:
            report = _eval_one(gpu_list[0])
        except Exception as e:  # noqa: BLE001
            return (_render_error(f"{type(e).__name__}: {e}", is_zh), "", "")
        main_html = _render(report, locale)
        explain_html = ""
        llm_html = ""
        if explain or llm_review:
            entries = build_explain(report)
            if explain:
                explain_html = _render_explain(entries, is_zh)
            if llm_review:
                # Only set env vars if user actually provided them — never persist
                # them in env beyond this call's scope (they live in process env
                # for the duration of the call, but we don't persist to disk).
                old_env = {
                    "LLM_CAL_REVIEWER_API_KEY": os.environ.get("LLM_CAL_REVIEWER_API_KEY"),
                    "LLM_CAL_REVIEWER_BASE_URL": os.environ.get("LLM_CAL_REVIEWER_BASE_URL"),
                    "LLM_CAL_REVIEWER_MODEL": os.environ.get("LLM_CAL_REVIEWER_MODEL"),
                }
                try:
                    # None-safe guards (match the hf_token/ms_token pattern above).
                    if llm_api_key and llm_api_key.strip():
                        os.environ["LLM_CAL_REVIEWER_API_KEY"] = llm_api_key.strip()
                    if llm_base_url and llm_base_url.strip():
                        os.environ["LLM_CAL_REVIEWER_BASE_URL"] = llm_base_url.strip()
                    if llm_model and llm_model.strip():
                        os.environ["LLM_CAL_REVIEWER_MODEL"] = llm_model.strip()
                    result = run_review(entries, locale=locale)  # type: ignore[arg-type]
                finally:
                    # Restore reviewer env exactly as found (pop keys that were unset).
                    for k, v in old_env.items():
                        if v is None:
                            os.environ.pop(k, None)
                        else:
                            os.environ[k] = v
                llm_html = _render_llm_review(result.content, result.error, result.model, is_zh)
        return main_html, explain_html, llm_html
    finally:
        # Restore HF/MS token env exactly as found, even on error paths.
        for k, v in old_token_env.items():
            if v is None:
                os.environ.pop(k, None)
            else:
                os.environ[k] = v
def show_loading(lang: str) -> tuple[str, str, str]:
    """Pre-submit hook: swap in the spinner, clear the two auxiliary panes."""
    zh = lang.startswith("中")
    return (_render_loading(zh), "", "")
| # --------------------------------------------------------------------------- | |
| # UI | |
| THEME = gr.themes.Soft(primary_hue="indigo") | |
# Static hero banner: title block, GitHub star link (with shields.io star
# count), and a before/after "pitch" grid contrasting a naive estimate with
# llm-cal's safetensors-based one. Pure HTML; styled by the .lc-hero* /
# .lc-pitch* rules in CUSTOM_CSS.
HERO_HTML = """
<div class='lc-hero'>
<div class='lc-hero-top'>
<div class='lc-hero-titleblock'>
<div class='lc-hero-title'>llm-cal</div>
<div class='lc-hero-tagline'>
LLM inference hardware calculator · 大模型推理硬件计算器<br>
Architecture-aware · Engine-aware · <strong>Honest-labeled</strong>
</div>
</div>
<a class='lc-hero-gh' href='https://github.com/FlyTOmeLight/llm-cal' target='_blank' rel='noopener'>
<svg viewBox='0 0 16 16' width='16' height='16' aria-hidden='true' fill='currentColor'>
<path d='M8 0C3.58 0 0 3.58 0 8a8 8 0 0 0 5.47 7.59c.4.07.55-.17.55-.38v-1.33c-2.22.48-2.69-1.07-2.69-1.07-.36-.92-.89-1.17-.89-1.17-.73-.5.06-.49.06-.49.81.06 1.23.83 1.23.83.72 1.23 1.88.87 2.34.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.83-2.15-.08-.2-.36-1.02.08-2.13 0 0 .67-.21 2.2.82a7.6 7.6 0 0 1 4 0c1.53-1.04 2.2-.82 2.2-.82.44 1.11.16 1.93.08 2.13.51.56.83 1.27.83 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48v2.19c0 .21.15.46.55.38A8 8 0 0 0 16 8c0-4.42-3.58-8-8-8z'/>
</svg>
<span class='lc-hero-gh-text'>GitHub</span>
<img class='lc-hero-gh-stars' alt='stars'
src='https://img.shields.io/github/stars/FlyTOmeLight/llm-cal?style=flat-square&logo=&label=&color=eef2ff&labelColor=eef2ff'
loading='lazy' />
</a>
</div>
<div class='lc-hero-pitch'>
<div class='lc-pitch-card lc-pitch-bad'>
<div class='lc-pitch-tool'>gpu_poor</div>
<div class='lc-pitch-num-bad'>284 GB</div>
<div class='lc-pitch-method'>assumes pure FP8 · 假设纯 FP8</div>
</div>
<div class='lc-pitch-arrow'>→</div>
<div class='lc-pitch-card lc-pitch-good'>
<div class='lc-pitch-tool'>llm-cal</div>
<div class='lc-pitch-num-good'>160 GB</div>
<div class='lc-pitch-method'>reads real safetensors bytes · 读真实字节</div>
</div>
<div class='lc-pitch-summary'>
<div class='lc-pitch-model'>DeepSeek-V4-Flash · H800</div>
<div class='lc-pitch-result'>0.2% error vs 45% · 误差 0.2% vs 45%</div>
</div>
</div>
</div>
"""
| CUSTOM_CSS = """ | |
| /* Font stack — system fonts in both English + Chinese, no Gradio default serif */ | |
| * { | |
| font-family: -apple-system, BlinkMacSystemFont, "Inter", "Helvetica Neue", | |
| "PingFang SC", "Microsoft YaHei", "Segoe UI", Roboto, Arial, sans-serif !important; | |
| } | |
| /* Hide Gradio's default footer chrome that looks like part of our app */ | |
| footer { display: none !important; } | |
| .show-api, .built-with, .settings { display: none !important; } | |
| /* Tighter overall padding + center on wide screens — without margin:auto the | |
| container left-aligns and leaves ~800px empty on 1920+ displays. | |
| width:100% makes it shrink to viewport when narrower than max-width | |
| (otherwise on mobile align-items:stretch + max-width overflows). */ | |
| .gradio-container { | |
| max-width: 1100px !important; | |
| width: 100% !important; | |
| margin-left: auto !important; | |
| margin-right: auto !important; | |
| } | |
| /* Hero section */ | |
| .lc-hero { | |
| margin: 8px 0 24px 0; | |
| padding: 24px 0 18px 0; | |
| border-bottom: 1px solid #e5e7eb; | |
| } | |
| .dark .lc-hero { border-bottom-color: #374151; } | |
| /* Top row: title block (left) + GitHub link (right). On mobile the GH link | |
| wraps to its own line above or below the title — order kept so it stays | |
| visible above the fold. */ | |
| .lc-hero-top { | |
| display: flex; | |
| align-items: flex-start; | |
| justify-content: space-between; | |
| gap: 16px; | |
| flex-wrap: wrap; | |
| margin-bottom: 14px; | |
| } | |
| .lc-hero-titleblock { | |
| flex: 1 1 320px; | |
| min-width: 0; | |
| } | |
| .lc-hero-gh { | |
| display: inline-flex; | |
| align-items: center; | |
| gap: 8px; | |
| padding: 6px 12px; | |
| border: 1px solid #c7d2fe; | |
| background: #eef2ff; | |
| border-radius: 999px; | |
| font-size: 13px !important; | |
| font-weight: 600 !important; | |
| color: #4338ca !important; | |
| text-decoration: none !important; | |
| white-space: nowrap; | |
| transition: background 0.15s ease, border-color 0.15s ease; | |
| flex: 0 0 auto; | |
| } | |
| .lc-hero-gh:hover { | |
| background: #e0e7ff; | |
| border-color: #a5b4fc; | |
| } | |
| .dark .lc-hero-gh { | |
| background: #1e1b4b; | |
| border-color: #3730a3; | |
| color: #c7d2fe !important; | |
| } | |
| .dark .lc-hero-gh:hover { background: #312e81; border-color: #4338ca; } | |
| .lc-hero-gh svg { display: block; } | |
| .lc-hero-gh-stars { | |
| height: 18px; | |
| vertical-align: middle; | |
| border-radius: 4px; | |
| } | |
| .lc-hero-title { | |
| font-size: 32px !important; | |
| font-weight: 800 !important; | |
| letter-spacing: -0.02em; | |
| color: #0f172a !important; | |
| margin: 0 !important; | |
| line-height: 1.15; | |
| } | |
| .dark .lc-hero-title { color: #f8fafc !important; } | |
| .lc-hero-tagline { | |
| font-size: 16px !important; | |
| color: #6b7280 !important; | |
| margin: 6px 0 16px 0; | |
| line-height: 1.5; | |
| } | |
| .lc-hero-pitch { | |
| display: grid; | |
| /* 4 cells: bad-card / arrow / good-card / summary on wide screens */ | |
| grid-template-columns: 1fr 30px 1fr 1.2fr; | |
| gap: 14px; | |
| align-items: stretch; | |
| padding: 0; | |
| font-size: 13px !important; | |
| color: #1e293b !important; | |
| } | |
| .dark .lc-hero-pitch { color: #f1f5f9 !important; } | |
| /* Tablet: bad / arrow / good in row 1, summary full-width row 2 */ | |
| @media (max-width: 900px) { | |
| .lc-hero-pitch { | |
| grid-template-columns: 1fr 28px 1fr; | |
| grid-template-rows: auto auto; | |
| } | |
| .lc-pitch-summary { grid-column: 1 / -1; } | |
| } | |
| /* Mobile: stack everything, hide the arrow */ | |
| @media (max-width: 540px) { | |
| .lc-hero-pitch { | |
| grid-template-columns: 1fr; | |
| grid-template-rows: repeat(3, auto); | |
| } | |
| .lc-pitch-arrow { display: none; } | |
| .lc-pitch-summary { grid-column: auto; } | |
| } | |
| .lc-pitch-card { | |
| padding: 14px 18px; | |
| border-radius: 10px; | |
| border: 1px solid #e5e7eb; | |
| background: #ffffff; | |
| display: flex; | |
| flex-direction: column; | |
| justify-content: center; | |
| min-width: 0; | |
| } | |
| .dark .lc-pitch-card { background: #111827; border-color: #374151; } | |
| /* Subtle accent bar on the left, not a screaming red/green border */ | |
| .lc-pitch-bad { border-left: 3px solid #cbd5e1; } | |
| .lc-pitch-good { border-left: 3px solid #4f46e5; } | |
| .dark .lc-pitch-bad { border-left-color: #475569; } | |
| .dark .lc-pitch-good { border-left-color: #818cf8; } | |
| .lc-pitch-tool { | |
| font-size: 12px !important; | |
| font-weight: 600 !important; | |
| color: #6b7280 !important; | |
| font-family: "SF Mono", "JetBrains Mono", Menlo, monospace !important; | |
| margin-bottom: 4px; | |
| } | |
| .lc-pitch-num-bad { font-size: 24px !important; font-weight: 800 !important; color: #b91c1c !important; line-height: 1.1; letter-spacing: -0.01em; } | |
| .lc-pitch-num-good { font-size: 24px !important; font-weight: 800 !important; color: #15803d !important; line-height: 1.1; letter-spacing: -0.01em; } | |
| .dark .lc-pitch-num-bad { color: #f87171 !important; } | |
| .dark .lc-pitch-num-good { color: #4ade80 !important; } | |
| .lc-pitch-method { | |
| font-size: 11px !important; | |
| color: #6b7280 !important; | |
| margin-top: 6px; | |
| line-height: 1.4; | |
| } | |
| .lc-pitch-arrow { | |
| display: flex; | |
| align-items: center; | |
| font-size: 22px !important; | |
| color: #9ca3af !important; | |
| font-weight: 300; | |
| } | |
| .lc-pitch-summary { | |
| flex: 1 1 200px; | |
| padding: 14px 18px; | |
| border-radius: 10px; | |
| background: #eef2ff; | |
| border: 1px solid #c7d2fe; | |
| display: flex; | |
| flex-direction: column; | |
| justify-content: center; | |
| } | |
| .dark .lc-pitch-summary { background: #1e1b4b; border-color: #3730a3; } | |
| .lc-pitch-model { | |
| font-size: 11px !important; | |
| font-weight: 600 !important; | |
| text-transform: uppercase; | |
| letter-spacing: 0.06em; | |
| color: #6366f1 !important; | |
| margin-bottom: 4px; | |
| } | |
| .dark .lc-pitch-model { color: #a5b4fc !important; } | |
| .lc-pitch-result { | |
| font-size: 14px !important; | |
| font-weight: 700 !important; | |
| color: #312e81 !important; | |
| } | |
| .dark .lc-pitch-result { color: #e0e7ff !important; } | |
| /* Primary button — match the indigo theme; constrain width so it's not a billboard */ | |
| button.primary, | |
| button[variant="primary"], | |
| .primary > button { | |
| background: #4f46e5 !important; | |
| border-color: #4f46e5 !important; | |
| color: #ffffff !important; | |
| font-weight: 600 !important; | |
| letter-spacing: 0.01em; | |
| border-radius: 8px !important; | |
| padding: 10px 28px !important; | |
| } | |
| button.primary:hover, | |
| button[variant="primary"]:hover, | |
| .primary > button:hover { background: #4338ca !important; border-color: #4338ca !important; } | |
| /* The wrapper around the Calculate button — center it, give it sane width */ | |
| .lc-submit-wrap { | |
| display: flex !important; | |
| justify-content: center !important; | |
| margin: 20px 0 8px 0 !important; | |
| } | |
| .lc-submit-wrap button { | |
| min-width: 220px !important; | |
| max-width: 320px !important; | |
| width: auto !important; | |
| } | |
| /* Form labels — kill Gradio's purple chip; make labels plain uppercase small text */ | |
| [data-testid="block-info"] { | |
| background: transparent !important; | |
| border: none !important; | |
| padding: 0 !important; | |
| margin: 0 0 6px 0 !important; | |
| font-size: 11px !important; | |
| font-weight: 600 !important; | |
| text-transform: uppercase !important; | |
| letter-spacing: 0.05em !important; | |
| color: #6b7280 !important; | |
| border-radius: 0 !important; | |
| display: block !important; | |
| } | |
| .dark [data-testid="block-info"] { color: #9ca3af !important; } | |
| /* Tooltip / info-text — single line, secondary color, no italic */ | |
| .info-text { | |
| font-size: 11px !important; | |
| color: #94a3b8 !important; | |
| margin: 0 0 4px 0 !important; | |
| line-height: 1.4 !important; | |
| padding: 0 !important; | |
| font-style: normal !important; | |
| white-space: normal !important; | |
| } | |
| .info-text br { display: none !important; } | |
| .dark .info-text { color: #64748b !important; } | |
| /* Kill Gradio's grey form-panel chrome entirely — labels + inputs float on the page */ | |
| .block, | |
| .block.padded, | |
| .block.gradio-container, | |
| .form, | |
| .row, | |
| [data-testid="block"] { | |
| background: transparent !important; | |
| border: none !important; | |
| box-shadow: none !important; | |
| } | |
| .block.padded { padding: 6px 0 !important; } | |
| .form { padding: 0 !important; } | |
| .row { padding: 0 !important; } | |
| /* Tighten row gap so inputs cluster more naturally */ | |
| .form, .row { gap: 16px !important; } | |
| /* Tablet (≤900px): Gradio's gr.Row() flex-direction: row keeps 3 inputs | |
| in one line. min-width: 320px forces 3-column rows to wrap to 2x1 + | |
| 1x1 at this size while leaving 2-column rows at 2-up. */ | |
| @media (max-width: 900px) { | |
| .form, | |
| .row { | |
| flex-wrap: wrap !important; | |
| } | |
| .form > .block, | |
| .row > .block { | |
| flex: 1 1 calc(50% - 12px) !important; | |
| min-width: 320px !important; | |
| max-width: 100% !important; | |
| } | |
| } | |
| /* Mobile (≤540px): single-column form. */ | |
| @media (max-width: 540px) { | |
| .form, | |
| .row { | |
| flex-direction: column !important; | |
| } | |
| .form > .block, | |
| .row > .block { | |
| flex: 1 1 100% !important; | |
| min-width: 0 !important; | |
| width: 100% !important; | |
| } | |
| .gradio-container { padding: 12px !important; } | |
| .lc-hero-title { font-size: 26px !important; } | |
| .lc-pitch-num-bad, .lc-pitch-num-good { font-size: 22px !important; } | |
| .lc-pitch-arrow { display: none !important; } | |
| } | |
| /* Inputs themselves — light border, soft fill */ | |
| input[type="text"], | |
| input[type="number"], | |
| input[type="password"], | |
| textarea, | |
| select { | |
| border: 1px solid #e5e7eb !important; | |
| border-radius: 8px !important; | |
| background: #ffffff !important; | |
| font-size: 14px !important; | |
| padding: 10px 12px !important; | |
| } | |
| .dark input, | |
| .dark textarea, | |
| .dark select { | |
| background: #111827 !important; | |
| border-color: #374151 !important; | |
| } | |
| input:focus, | |
| textarea:focus { | |
| border-color: #4f46e5 !important; | |
| outline: none !important; | |
| box-shadow: 0 0 0 3px rgba(79,70,229,0.12) !important; | |
| } | |
| /* Accordion — Gradio 6 has no .accordion class; the only signal is a .block | |
| that *contains* a button.label-wrap. Use :has() to match precisely. */ | |
| .block.padded:has(> button.label-wrap) { | |
| background: #ffffff !important; | |
| border: 1px solid #e5e7eb !important; | |
| border-radius: 10px !important; | |
| margin: 14px 0 !important; | |
| padding: 0 !important; | |
| overflow: hidden !important; | |
| } | |
| .dark .block.padded:has(> button.label-wrap) { | |
| background: #111827 !important; | |
| border-color: #374151 !important; | |
| } | |
| button.label-wrap { | |
| background: #f8fafc !important; | |
| padding: 14px 18px !important; | |
| font-weight: 600 !important; | |
| font-size: 14px !important; | |
| color: #1f2937 !important; | |
| width: 100% !important; | |
| text-align: left !important; | |
| cursor: pointer !important; | |
| border: none !important; | |
| border-bottom: 1px solid #e5e7eb !important; | |
| display: flex !important; | |
| justify-content: space-between !important; | |
| align-items: center !important; | |
| letter-spacing: 0.01em; | |
| } | |
| .dark button.label-wrap { | |
| background: #1e293b !important; | |
| color: #f1f5f9 !important; | |
| border-bottom-color: #374151 !important; | |
| } | |
| button.label-wrap:hover { background: #f1f5f9 !important; } | |
| .dark button.label-wrap:hover { background: #334155 !important; } | |
| /* Sibling content of the header (the body when expanded) */ | |
| .block.padded:has(> button.label-wrap) > *:not(button.label-wrap) { | |
| padding: 16px 18px !important; | |
| background: #ffffff !important; | |
| } | |
| .dark .block.padded:has(> button.label-wrap) > *:not(button.label-wrap) { | |
| background: #111827 !important; | |
| } | |
| /* gr.Examples table — the default Gradio render is a raw HTML table with black | |
| borders and no hover state. Style it to match the rest of the page. */ | |
| .gradio-dataset, | |
| [data-testid="dataset"] { | |
| margin-top: 24px !important; | |
| background: transparent !important; | |
| border: none !important; | |
| } | |
| .gradio-dataset table, | |
| [data-testid="dataset"] table { | |
| border-collapse: collapse !important; | |
| border: 1px solid #e5e7eb !important; | |
| border-radius: 8px !important; | |
| overflow: hidden !important; | |
| font-size: 13px !important; | |
| width: 100% !important; | |
| } | |
| .dark .gradio-dataset table, | |
| .dark [data-testid="dataset"] table { border-color: #374151 !important; } | |
| .gradio-dataset thead, | |
| [data-testid="dataset"] thead { background: #f9fafb !important; } | |
| .dark .gradio-dataset thead, | |
| .dark [data-testid="dataset"] thead { background: #111827 !important; } | |
| .gradio-dataset th, | |
| [data-testid="dataset"] th { | |
| font-size: 11px !important; | |
| font-weight: 600 !important; | |
| text-transform: uppercase !important; | |
| letter-spacing: 0.05em !important; | |
| color: #6b7280 !important; | |
| text-align: left !important; | |
| padding: 10px 12px !important; | |
| border: none !important; | |
| border-bottom: 1px solid #e5e7eb !important; | |
| } | |
| .gradio-dataset td, | |
| [data-testid="dataset"] td { | |
| padding: 9px 12px !important; | |
| border: none !important; | |
| border-bottom: 1px solid #f3f4f6 !important; | |
| color: #1f2937 !important; | |
| font-size: 13px !important; | |
| background: transparent !important; | |
| cursor: pointer !important; | |
| } | |
| .dark .gradio-dataset td, | |
| .dark [data-testid="dataset"] td { | |
| color: #e5e7eb !important; | |
| border-bottom-color: #1f2937 !important; | |
| } | |
| .gradio-dataset tbody tr:last-child td, | |
| [data-testid="dataset"] tbody tr:last-child td { border-bottom: none !important; } | |
| .gradio-dataset tbody tr:hover, | |
| [data-testid="dataset"] tbody tr:hover { background: rgba(79, 70, 229, 0.04) !important; } | |
| .dark .gradio-dataset tbody tr:hover, | |
| .dark [data-testid="dataset"] tbody tr:hover { background: rgba(129, 140, 248, 0.08) !important; } | |
| /* Examples header label — Gradio puts a "Try one of these" label above */ | |
| .gradio-dataset > .label, | |
| [data-testid="dataset"] > .label, | |
| .gradio-dataset .block-label, | |
| .dataset .block-label { | |
| font-size: 11px !important; | |
| font-weight: 600 !important; | |
| text-transform: uppercase !important; | |
| letter-spacing: 0.06em !important; | |
| color: #6b7280 !important; | |
| background: transparent !important; | |
| border: none !important; | |
| padding: 0 0 6px 0 !important; | |
| margin-bottom: 0 !important; | |
| } | |
| /* Footer link strip */ | |
| .lc-footer { | |
| margin-top: 28px; | |
| padding: 14px 0; | |
| border-top: 1px solid #e5e7eb; | |
| font-size: 13px !important; | |
| color: #6b7280 !important; | |
| } | |
| .dark .lc-footer { border-top-color: #374151; } | |
| .lc-footer a { color: #4f46e5 !important; text-decoration: none; } | |
| .lc-footer a:hover { text-decoration: underline; } | |
| .dark .lc-footer a { color: #818cf8 !important; } | |
| /* Result wrapper */ | |
| .lc-result { | |
| padding: 4px 0; | |
| font-size: 14px; | |
| line-height: 1.55; | |
| color: #111827 !important; | |
| } | |
| .dark .lc-result { color: #f3f4f6 !important; } | |
| /* Headline */ | |
| .lc-header { padding: 4px 0 14px 0; border-bottom: 1px solid #e5e7eb; } | |
| .dark .lc-header { border-bottom-color: #374151; } | |
| .lc-title { | |
| font-size: 22px !important; | |
| font-weight: 700 !important; | |
| letter-spacing: -0.01em; | |
| color: #0f172a !important; | |
| } | |
| .dark .lc-title { color: #f8fafc !important; } | |
| .lc-subtitle { | |
| font-size: 13px !important; | |
| color: #6b7280 !important; | |
| margin-top: 2px; | |
| } | |
| /* Headline stat cards */ | |
| .lc-stats { | |
| display: grid; | |
| grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); | |
| gap: 12px; | |
| margin: 16px 0 8px 0; | |
| } | |
| .lc-stat { | |
| border: 1px solid #e5e7eb; | |
| border-radius: 10px; | |
| padding: 14px 16px; | |
| background: #ffffff; | |
| } | |
| .dark .lc-stat { background: #111827; border-color: #374151; } | |
| .lc-stat-value { | |
| font-size: 24px !important; | |
| font-weight: 700 !important; | |
| letter-spacing: -0.01em; | |
| line-height: 1.2; | |
| color: #0f172a !important; | |
| } | |
| .dark .lc-stat-value { color: #f8fafc !important; } | |
| .lc-stat-label { | |
| font-size: 11px !important; | |
| text-transform: uppercase; | |
| letter-spacing: 0.05em; | |
| color: #6b7280 !important; | |
| margin-top: 4px; | |
| font-weight: 500 !important; | |
| } | |
| .lc-stat-sub { | |
| font-size: 11px !important; | |
| color: #9ca3af !important; | |
| margin-top: 2px; | |
| } | |
| .lc-stat-chip { margin-top: 10px; } | |
| .lc-chip { | |
| display: inline-block; | |
| padding: 2px 8px; | |
| border-radius: 999px; | |
| font-size: 11px !important; | |
| font-weight: 600 !important; | |
| letter-spacing: 0.02em; | |
| } | |
| .lc-prov { | |
| margin-top: 6px; | |
| font-size: 12px !important; | |
| color: #6b7280 !important; | |
| font-style: italic; | |
| } | |
| /* Sections */ | |
| .lc-section { margin: 24px 0 0 0; } | |
| .lc-section h3 { | |
| font-size: 13px !important; | |
| font-weight: 600 !important; | |
| text-transform: uppercase; | |
| letter-spacing: 0.06em; | |
| color: #6b7280 !important; | |
| margin: 0 0 6px 0 !important; | |
| } | |
| .lc-section-help { | |
| font-size: 12px !important; | |
| color: #6b7280 !important; | |
| margin: 0 0 10px 0; | |
| line-height: 1.5; | |
| } | |
| /* Tables */ | |
| .lc-table { | |
| width: 100%; | |
| border-collapse: collapse; | |
| font-size: 13px !important; | |
| color: #111827 !important; | |
| } | |
| .dark .lc-table { color: #f3f4f6 !important; } | |
| .lc-table th, .lc-table td { | |
| padding: 8px 10px; | |
| border-bottom: 1px solid #f3f4f6; | |
| text-align: left; | |
| } | |
| .dark .lc-table th, .dark .lc-table td { border-bottom-color: #1f2937; } | |
| .lc-table th { | |
| font-size: 11px !important; | |
| text-transform: uppercase; | |
| letter-spacing: 0.04em; | |
| color: #6b7280 !important; | |
| font-weight: 500 !important; | |
| } | |
| .lc-table-recon td:nth-child(2), | |
| .lc-table-recon td:nth-child(3) { text-align: right; } | |
| .lc-best { background: rgba(22, 163, 74, 0.08); } | |
| .dark .lc-best { background: rgba(22, 163, 74, 0.18); } | |
| /* Performance grid */ | |
| .lc-perf { | |
| display: grid; | |
| grid-template-columns: repeat(auto-fit, minmax(170px, 1fr)); | |
| gap: 12px; | |
| } | |
| .lc-perf-item { | |
| border: 1px solid #e5e7eb; | |
| border-radius: 10px; | |
| padding: 12px 14px; | |
| background: #ffffff; | |
| } | |
| .dark .lc-perf-item { border-color: #374151; background: #111827; } | |
| .lc-perf-value { | |
| font-size: 20px !important; | |
| font-weight: 700 !important; | |
| letter-spacing: -0.01em; | |
| color: #0f172a !important; | |
| line-height: 1.2; | |
| } | |
| .dark .lc-perf-value { color: #f8fafc !important; } | |
| .lc-perf-value code { | |
| font-size: 16px !important; | |
| font-weight: 600 !important; | |
| background: transparent !important; | |
| color: #0f172a !important; | |
| padding: 0 !important; | |
| } | |
| .dark .lc-perf-value code { color: #f8fafc !important; } | |
| .lc-perf-label { | |
| font-size: 11px !important; | |
| text-transform: uppercase; | |
| letter-spacing: 0.05em; | |
| color: #6b7280 !important; | |
| margin-top: 4px; | |
| font-weight: 500 !important; | |
| } | |
| .lc-perf-sub { | |
| font-size: 11px !important; | |
| color: #9ca3af !important; | |
| margin-top: 1px; | |
| } | |
| /* Inline code */ | |
| .lc-result code { | |
| font-family: "SF Mono", "JetBrains Mono", Menlo, Consolas, monospace !important; | |
| font-size: 0.92em !important; | |
| color: #0f172a !important; | |
| background: rgba(15, 23, 42, 0.06); | |
| padding: 1px 5px; | |
| border-radius: 4px; | |
| } | |
| .dark .lc-result code { | |
| color: #e2e8f0 !important; | |
| background: rgba(226, 232, 240, 0.08); | |
| } | |
| /* Generated command — ALWAYS dark theme regardless of mode */ | |
| .lc-cmd { | |
| background: #0b1220 !important; | |
| color: #f1f5f9 !important; | |
| padding: 16px 18px !important; | |
| border-radius: 8px; | |
| font-size: 12.5px !important; | |
| overflow-x: auto; | |
| white-space: pre; | |
| border: 1px solid #1e293b !important; | |
| margin: 0 !important; | |
| } | |
| .lc-cmd code { | |
| font-family: "SF Mono", "JetBrains Mono", Menlo, Consolas, monospace !important; | |
| background: transparent !important; | |
| color: #f1f5f9 !important; | |
| padding: 0 !important; | |
| font-size: 12.5px !important; | |
| border-radius: 0 !important; | |
| } | |
| /* Comparison view — side-by-side metrics across GPUs */ | |
| .lc-cmp-wrap { | |
| overflow-x: auto; | |
| margin: 8px 0 12px 0; | |
| border: 1px solid #e5e7eb; | |
| border-radius: 10px; | |
| background: #ffffff; | |
| } | |
| .dark .lc-cmp-wrap { background: #111827; border-color: #374151; } | |
| .lc-cmp-table { | |
| width: 100%; | |
| border-collapse: collapse; | |
| font-size: 13px !important; | |
| } | |
| .lc-cmp-table th, | |
| .lc-cmp-table td { | |
| padding: 10px 12px; | |
| text-align: left; | |
| border-bottom: 1px solid #f3f4f6; | |
| } | |
| .dark .lc-cmp-table th, | |
| .dark .lc-cmp-table td { border-bottom-color: #1f2937; } | |
| .lc-cmp-table thead th { | |
| font-size: 11px !important; | |
| text-transform: uppercase; | |
| letter-spacing: 0.05em; | |
| color: #6b7280 !important; | |
| font-weight: 600 !important; | |
| background: #f9fafb; | |
| } | |
| .dark .lc-cmp-table thead th { background: #1e293b; color: #9ca3af !important; } | |
| .lc-cmp-row-label { | |
| font-size: 12px !important; | |
| color: #6b7280 !important; | |
| font-weight: 600 !important; | |
| white-space: nowrap; | |
| } | |
| .lc-cmp-row-info { | |
| font-style: italic; | |
| color: #9ca3af !important; | |
| } | |
| .dark .lc-cmp-row-info { color: #6b7280 !important; } | |
| .lc-cmp-tr-info td { | |
| color: #6b7280; | |
| background: #fafafa; | |
| } | |
| .dark .lc-cmp-tr-info td { color: #9ca3af; background: #0f172a; } | |
| .lc-cmp-gpu { | |
| font-family: "SF Mono", "JetBrains Mono", Menlo, monospace !important; | |
| font-size: 12px !important; | |
| } | |
| .lc-cmp-table tbody tr:last-child td { border-bottom: none; } | |
| .lc-cmp-winner { | |
| background: rgba(22, 163, 74, 0.10) !important; | |
| font-weight: 700 !important; | |
| color: #15803d !important; | |
| position: relative; | |
| } | |
| .dark .lc-cmp-winner { background: rgba(74, 222, 128, 0.15) !important; color: #4ade80 !important; } | |
| .lc-cmp-winner::before { | |
| content: "✓ "; | |
| font-size: 11px; | |
| font-weight: 700; | |
| color: #15803d; | |
| margin-right: 2px; | |
| } | |
| .dark .lc-cmp-winner::before { color: #4ade80; } | |
| .lc-cmp-summary { | |
| margin-top: 12px; | |
| padding: 12px 14px; | |
| border-radius: 8px; | |
| background: #eef2ff; | |
| border: 1px solid #c7d2fe; | |
| font-size: 13px !important; | |
| color: #312e81 !important; | |
| } | |
| .dark .lc-cmp-summary { | |
| background: #1e1b4b; | |
| border-color: #3730a3; | |
| color: #e0e7ff !important; | |
| } | |
| .lc-cmp-summary strong { color: #4338ca; } | |
| .dark .lc-cmp-summary strong { color: #a5b4fc; } | |
| /* Per-GPU detail cards under the table */ | |
| .lc-cmp-details { | |
| display: grid; | |
| grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); | |
| gap: 12px; | |
| } | |
| .lc-cmp-detail { | |
| border: 1px solid #e5e7eb; | |
| border-radius: 10px; | |
| padding: 12px 14px; | |
| background: #ffffff; | |
| } | |
| .dark .lc-cmp-detail { background: #111827; border-color: #374151; } | |
| .lc-cmp-detail-gpu { | |
| font-family: "SF Mono", "JetBrains Mono", Menlo, monospace !important; | |
| font-size: 13px !important; | |
| font-weight: 700 !important; | |
| color: #0f172a !important; | |
| margin-bottom: 6px; | |
| padding-bottom: 6px; | |
| border-bottom: 1px solid #e5e7eb; | |
| } | |
| .dark .lc-cmp-detail-gpu { color: #f8fafc !important; border-bottom-color: #374151; } | |
| .lc-cmp-detail-row { | |
| display: flex; | |
| justify-content: space-between; | |
| font-size: 12px !important; | |
| padding: 3px 0; | |
| } | |
| .lc-cmp-detail-row span { color: #6b7280 !important; } | |
| .lc-cmp-detail-row strong { | |
| color: #0f172a !important; | |
| font-size: 13px !important; | |
| } | |
| .dark .lc-cmp-detail-row strong { color: #f8fafc !important; } | |
| /* Star-on-GitHub CTA — shown at the bottom of the result, capturing the | |
| peak-satisfaction moment. Card-style with indigo accent so it reads as | |
| "thanks", not as a banner ad. */ | |
| .lc-star-cta { | |
| display: flex; | |
| align-items: center; | |
| gap: 14px; | |
| margin: 28px 0 8px 0; | |
| padding: 14px 18px; | |
| border: 1px solid #c7d2fe; | |
| background: #eef2ff; | |
| border-radius: 10px; | |
| text-decoration: none !important; | |
| color: #312e81 !important; | |
| transition: background 0.15s ease, border-color 0.15s ease, transform 0.1s ease; | |
| } | |
| .lc-star-cta:hover { | |
| background: #e0e7ff; | |
| border-color: #a5b4fc; | |
| } | |
| .lc-star-cta:active { transform: scale(0.995); } | |
| .dark .lc-star-cta { | |
| background: #1e1b4b; | |
| border-color: #3730a3; | |
| color: #c7d2fe !important; | |
| } | |
| .dark .lc-star-cta:hover { background: #312e81; } | |
| .lc-star-cta svg { flex: 0 0 auto; color: #4338ca; } | |
| .dark .lc-star-cta svg { color: #a5b4fc; } | |
| .lc-star-cta-text { flex: 1 1 auto; min-width: 0; } | |
| .lc-star-cta-q { | |
| font-size: 14px !important; | |
| font-weight: 600 !important; | |
| line-height: 1.3; | |
| color: #312e81 !important; | |
| } | |
| .dark .lc-star-cta-q { color: #e0e7ff !important; } | |
| .lc-star-cta-q-en { | |
| font-size: 12px !important; | |
| color: #6366f1 !important; | |
| margin-top: 2px; | |
| line-height: 1.3; | |
| } | |
| .dark .lc-star-cta-q-en { color: #a5b4fc !important; } | |
| .lc-star-cta-action { | |
| flex: 0 0 auto; | |
| font-size: 13px !important; | |
| font-weight: 700 !important; | |
| color: #4338ca !important; | |
| white-space: nowrap; | |
| } | |
| .dark .lc-star-cta-action { color: #c7d2fe !important; } | |
| @media (max-width: 540px) { | |
| .lc-star-cta { flex-wrap: wrap; gap: 10px; } | |
| .lc-star-cta-action { flex-basis: 100%; } | |
| } | |
| /* Loading + error */ | |
| .lc-loading { | |
| display: flex; | |
| align-items: center; | |
| gap: 14px; | |
| padding: 24px; | |
| color: #6b7280 !important; | |
| font-size: 14px !important; | |
| } | |
| .lc-spinner { | |
| width: 18px; height: 18px; | |
| border: 2px solid #cbd5e1; | |
| border-top-color: #4f46e5; | |
| border-radius: 50%; | |
| animation: lc-spin 0.7s linear infinite; | |
| flex: none; | |
| } | |
| @keyframes lc-spin { to { transform: rotate(360deg); } } | |
| .lc-error pre { | |
| background: #fef2f2; | |
| color: #991b1b !important; | |
| padding: 12px 14px; | |
| border-radius: 8px; | |
| border: 1px solid #fecaca; | |
| font-size: 12px !important; | |
| white-space: pre-wrap; | |
| word-break: break-word; | |
| margin: 0; | |
| } | |
| .dark .lc-error pre { background: #450a0a; color: #fca5a5 !important; border-color: #7f1d1d; } | |
| /* Explain trace */ | |
| .lc-explain-entry { | |
| margin: 14px 0; | |
| padding: 14px 16px; | |
| border: 1px solid #e5e7eb; | |
| border-left: 3px solid #4f46e5; | |
| border-radius: 8px; | |
| background: #fafafa; | |
| } | |
| .dark .lc-explain-entry { background: #0f172a; border-color: #374151; border-left-color: #818cf8; } | |
| .lc-explain-heading { | |
| font-weight: 700 !important; | |
| font-size: 14px !important; | |
| margin-bottom: 8px; | |
| color: #0f172a !important; | |
| } | |
| .dark .lc-explain-heading { color: #f8fafc !important; } | |
| .lc-explain-formula { | |
| margin: 6px 0; | |
| font-size: 12.5px !important; | |
| } | |
| .lc-explain-formula code { | |
| background: rgba(79, 70, 229, 0.08) !important; | |
| color: #4338ca !important; | |
| padding: 4px 8px !important; | |
| border-radius: 4px; | |
| } | |
| .dark .lc-explain-formula code { color: #a5b4fc !important; background: rgba(165, 180, 252, 0.12) !important; } | |
| .lc-explain-inputs, .lc-explain-steps { | |
| margin: 6px 0 6px 1.2em; | |
| font-size: 12.5px !important; | |
| line-height: 1.7; | |
| } | |
| .lc-explain-label { | |
| font-size: 11px !important; | |
| color: #6b7280 !important; | |
| font-style: italic; | |
| } | |
| .lc-explain-result { | |
| margin-top: 8px; | |
| padding-top: 8px; | |
| border-top: 1px dashed #e5e7eb; | |
| font-size: 13px !important; | |
| color: #0f172a !important; | |
| } | |
| .dark .lc-explain-result { color: #f8fafc !important; border-top-color: #374151; } | |
| /* LLM review */ | |
| .lc-llm-banner { | |
| display: flex; | |
| align-items: center; | |
| gap: 8px; | |
| padding: 8px 12px; | |
| background: #f9fafb; | |
| border: 1px solid #e5e7eb; | |
| border-radius: 8px; | |
| font-size: 12px !important; | |
| color: #4b5563 !important; | |
| margin-bottom: 12px; | |
| } | |
| .dark .lc-llm-banner { color: #d1d5db !important; background: #111827; border-color: #374151; } | |
| .lc-llm-model { | |
| font-size: 11px !important; | |
| color: #6b7280 !important; | |
| font-weight: 500 !important; | |
| margin-left: 6px; | |
| text-transform: none !important; | |
| letter-spacing: 0 !important; | |
| } | |
| .lc-llm-content { | |
| font-size: 13px !important; | |
| line-height: 1.7; | |
| color: #0f172a !important; | |
| padding: 12px 14px; | |
| border: 1px solid #e5e7eb; | |
| border-radius: 8px; | |
| background: #ffffff; | |
| } | |
| .dark .lc-llm-content { color: #f3f4f6 !important; background: #111827; border-color: #374151; } | |
| """ | |
def _build_ui() -> gr.Blocks:
    """Assemble the Gradio Blocks app.

    Layout: hero header → required inputs (model id / source / vendor / GPU)
    → engine + context + language row → collapsible "Performance tuning" and
    "Advanced" accordions → submit button → three HTML output panes (main
    result, ``--explain`` trace, LLM review) → clickable examples → footer.

    Returns:
        The fully wired ``gr.Blocks`` instance; the caller launches it.
    """
    # theme= and css= are gr.Blocks() constructor arguments — Gradio bakes
    # them into the app at construction time, so they must be supplied here.
    # The result HTML relies on the lc-* classes defined in CUSTOM_CSS.
    with gr.Blocks(
        title="llm-cal — LLM hardware calculator",
        theme=THEME,
        css=CUSTOM_CSS,
    ) as demo:
        gr.HTML(HERO_HTML)

        # ---- Required ----------------------------------------------------
        with gr.Row():
            model_id = gr.Textbox(
                label="Model ID · 模型 ID",
                placeholder="e.g. deepseek-ai/DeepSeek-V4-Flash",
                info="Repo id · 仓库 ID(owner/name)",
                scale=3,
            )
            source = gr.Radio(
                choices=["HuggingFace", "ModelScope"],
                value="HuggingFace",
                label="Source · 来源",
                info="Where to pull model metadata · 拉取来源",
                scale=2,
            )
        with gr.Row():
            vendor = gr.Dropdown(
                choices=VENDOR_CHOICES_EN,
                value=DEFAULT_VENDOR,
                label="GPU vendor · GPU 厂商",
                info="11 vendors covered · 共 11 家",
                scale=1,
            )
            gpu = gr.Dropdown(
                choices=_VENDOR_TO_GPUS[DEFAULT_VENDOR],
                value=[DEFAULT_GPU],
                label="GPU model · GPU 型号",
                info="One GPU = single eval. 2-4 = compare side-by-side · 选 1 张单评估,2-4 张对比",
                scale=2,
                multiselect=True,
                max_choices=4,
                allow_custom_value=True,
            )
        with gr.Row():
            engine = gr.Radio(
                choices=["vllm", "sglang"],
                value="vllm",
                label="Engine · 引擎",
                info="Inference engine · 推理引擎",
            )
            context_length = gr.Number(
                label="Context length · Context 长度",
                value=None,
                precision=0,
                info="Empty = 4K/32K/128K/1M · 留空显示全档",
            )
            lang = gr.Radio(
                choices=["English", "中文"],
                value="English",
                label="Output language · 输出语言",
                info="Result area only · 仅影响下方结果区",
            )

        # ---- Performance tuning (collapsible) ----------------------------
        with gr.Accordion("Performance tuning · 性能参数", open=False):
            with gr.Row():
                input_tokens = gr.Number(
                    label="Input tokens · 输入 tokens",
                    value=2000,
                    precision=0,
                    info="Prefill budget · Prefill 预算",
                )
                output_tokens = gr.Number(
                    label="Output tokens · 输出 tokens",
                    value=512,
                    precision=0,
                    info="Decode budget · Decode 预算",
                )
                target_tps = gr.Number(
                    label="Target tok/s/user · 单用户目标 tok/s",
                    value=30.0,
                    info="SLA per user · 单用户 SLA(30 ≈ 流畅阅读)",
                )
            with gr.Row():
                prefill_util = gr.Number(
                    label="Prefill util · Prefill 利用率",
                    value=0.40,
                    info="0–1 · 0.40 = vLLM paper baseline",
                )
                decode_bw_util = gr.Number(
                    label="Decode BW util · Decode 带宽利用率",
                    value=0.50,
                    info="0–1 · 0.50 = community median",
                )
                concurrency_degradation = gr.Number(
                    label="Concurrency degradation · 并发衰减",
                    value=1.0,
                    info="1.0 = honest · 1.67 = 60% efficiency under load",
                )

        # ---- Advanced (collapsible) --------------------------------------
        with gr.Accordion("Advanced · 高级", open=False):
            with gr.Row():
                hf_token = gr.Textbox(
                    label="HF_TOKEN",
                    value="",
                    placeholder="hf_...",
                    type="password",
                    info="For gated HF models · 私有 HF 模型用",
                )
                ms_token = gr.Textbox(
                    label="MODELSCOPE_API_TOKEN",
                    value="",
                    placeholder="ms-...",
                    type="password",
                    info="For gated MS models · 私有 MS 模型用",
                )
            with gr.Row():
                gpu_count = gr.Number(
                    label="Force GPU count · 强制 GPU 数",
                    value=None,
                    precision=0,
                    info="Empty = auto min/dev/prod · 留空自动给三档",
                )
                refresh = gr.Checkbox(
                    label="Refresh cache · 刷新缓存",
                    value=False,
                    info="Bypass diskcache · 跳过本地缓存",
                )
            with gr.Row():
                explain = gr.Checkbox(
                    label="--explain · 推导链",
                    value=False,
                    info="Full derivation trace · 输出完整推导链",
                )
                llm_review = gr.Checkbox(
                    label="--llm-review · LLM 审计",
                    value=False,
                    info="Second opinion from an LLM · 第二意见审计",
                )
            with gr.Row():
                llm_api_key = gr.Textbox(
                    label="LLM API key · LLM API 密钥",
                    value="",
                    placeholder="sk-...",
                    type="password",
                    info="OpenAI-compatible endpoint · OpenAI 兼容端点",
                )
                llm_base_url = gr.Textbox(
                    label="LLM base URL · LLM 基地址",
                    value="",
                    placeholder="https://api.openai.com/v1",
                    info="e.g. https://api.deepseek.com/v1",
                )
                llm_model = gr.Textbox(
                    label="LLM model · LLM 模型名",
                    value="",
                    placeholder="gpt-4o",
                    info="e.g. gpt-4o / deepseek-chat / MiniMax-M2",
                )

        with gr.Row(elem_classes="lc-submit-wrap"):
            submit = gr.Button("Calculate · 计算", variant="primary", size="lg")

        # Three output panes — main always shows, explain/llm-review only when toggled
        output_main = gr.HTML(label="Result")
        output_explain = gr.HTML(label="Explain trace")
        output_llm = gr.HTML(label="LLM review")

        gr.Examples(
            examples=[
                # gpu wrapped in a list — the Dropdown is multiselect now
                [m, v, [g], e, None, "English", s]
                for m, v, g, e, s in EXAMPLE_MODELS
            ],
            inputs=[model_id, vendor, gpu, engine, context_length, lang, source],
            label="Try one of these · 试试这些组合",
        )
        gr.HTML(
            "<div class='lc-footer'>"
            "<a href='https://github.com/FlyTOmeLight/llm-cal' target='_blank'>GitHub</a> · "
            "<a href='https://flytomelight.github.io/llm-cal/' target='_blank'>Docs</a> · "
            "<a href='https://flytomelight.github.io/llm-cal/methodology/' target='_blank'>Methodology</a> · "
            "<code>pip install llm-cal</code>"
            "</div>"
        )

        # When vendor changes, repopulate the GPU dropdown but PRESERVE any
        # cross-vendor selections (the whole point of compare mode is to
        # stack e.g. H800 + MI300X + 910B4 across NVIDIA/AMD/Ascend).
        # Preserved values outside the new vendor's choices remain valid
        # because the dropdown sets allow_custom_value=True.
        def _on_vendor_change(v: str, current):  # noqa: ANN001, ANN202
            gpus = _VENDOR_TO_GPUS.get(v, [])
            # multiselect returns list; harden against str/None for safety
            if isinstance(current, list):
                keep = list(current)
            elif current:
                keep = [current]
            else:
                keep = []
            # Empty selection? Seed with the first GPU so the form stays usable.
            if not keep:
                keep = [gpus[0]] if gpus else []
            return gr.Dropdown(choices=gpus, value=keep)

        vendor.change(fn=_on_vendor_change, inputs=[vendor, gpu], outputs=[gpu])

        # Click flow: instantly show "loading…", THEN run calculate.
        all_outputs = [output_main, output_explain, output_llm]
        submit.click(
            fn=show_loading,
            inputs=[lang],
            outputs=all_outputs,
        ).then(
            fn=calculate,
            inputs=[
                model_id, gpu, engine, context_length, lang, source,
                gpu_count, input_tokens, output_tokens, target_tps,
                prefill_util, decode_bw_util, concurrency_degradation,
                refresh, explain, llm_review,
                hf_token, ms_token,
                llm_api_key, llm_base_url, llm_model,
            ],
            outputs=all_outputs,
        )
    return demo
def _prewarm_cache() -> None:
    """Fill the artifact cache for every Examples row so first-click users
    don't pay the 3-8s HF/MS metadata roundtrip.

    Runs on a daemon thread alongside the Gradio server. Failures are
    swallowed (printed only) — pre-warm is a UX nicety, never a hard
    dependency. Set LLM_CAL_PREWARM=0 to disable (useful for local dev
    when you don't want 9 API calls every time you `python web/app.py`).
    """
    import time

    total = len(EXAMPLE_MODELS)
    print(f"[prewarm] starting cache warm-up for {total} examples")
    for idx, (mid, _vendor, gpu_id, eng, src) in enumerate(EXAMPLE_MODELS, 1):
        # Source strings are display labels; anything mentioning "modelscope"
        # maps to the MS evaluator, everything else defaults to HF.
        src_key = "modelscope" if "modelscope" in src.lower() else "huggingface"
        label = f"{idx}/{total} {src_key}:{mid}"
        started = time.monotonic()
        try:
            _get_evaluator(src_key).evaluate(
                model_id=mid,
                gpu=gpu_id,
                engine=eng,
            )
        except Exception as exc:  # noqa: BLE001
            print(f"[prewarm] {label} skip — {type(exc).__name__}: {exc}")
        else:
            print(f"[prewarm] {label} ok ({time.monotonic() - started:.1f}s)")
        # Throttle to stay well under HF/MS anonymous rate limits.
        time.sleep(2)
    print("[prewarm] done")
if __name__ == "__main__":
    # Optional background cache warm-up; LLM_CAL_PREWARM=0 disables it.
    if os.environ.get("LLM_CAL_PREWARM", "1") == "1":
        import threading

        threading.Thread(target=_prewarm_cache, daemon=True).start()
    # BUG FIX: Blocks.launch() accepts no `theme=` / `css=` kwargs — those
    # are gr.Blocks() constructor arguments — so passing them here raised
    # TypeError at startup. Launch plain; theming belongs to the Blocks app.
    _build_ui().launch()