"
f"{_stat_card('Weight' if not is_zh else '权重', weight_str, sublabel='from safetensors API' if not is_zh else '取自 safetensors API', chip=weight_chip)}"
f"{_stat_card('Quantization' if not is_zh else '量化', _esc(w.quantization_guess.value), sublabel='resolved scheme' if not is_zh else '已识别方案', chip=quant_chip)}"
f"{_stat_card('Prod GPUs' if not is_zh else 'Prod GPU 数', prod_gpus, sublabel='for 16-user prod' if not is_zh else '生产档(16 路并发)')}"
f"{_stat_card('Users @ 128K' if not is_zh else '用户 @ 128K', prod_concurrent, sublabel='concurrent at prod tier' if not is_zh else '生产档的并发')}"
f"
"
)
# Provenance footer for the headline
quant_source = _esc(w.quantization_guess.source or "")
headline += f"
"
arch_html = "".join(
    f"<tr><th>{_esc(k)}</th><td>{_esc(v)}</td></tr>"
    for k, v in arch_rows
)
arch_explainer = (
"从模型 config.json 读出来的,决定后续所有公式怎么走(是否分组注意力、是否 MoE、是否滑动窗口)。"
if is_zh
else "Read straight from the model's config.json. Drives every formula "
"downstream — attention sharding, MoE active-expert ratio, sliding window."
)
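# Illustrative only (standard HF config.json field names, not values read from any
# specific model): the fields this section surfaces are roughly
#   num_hidden_layers, num_attention_heads, num_key_value_heads -> GQA grouping
#   num_local_experts / num_experts_per_tok                     -> MoE active-expert ratio
#   sliding_window                                              -> windowed attention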
arch_section = (
f"
{'架构' if is_zh else 'Architecture'}
"
f"
{arch_explainer}
"
f"
{arch_html}
"
)
# ---- Reconciliation ------------------------------------------------------
recon_rows = []
for c in r.candidates[:5]:
is_best = c.scheme == r.best.value
cls = " class='lc-best'" if is_best else ""
marker = " ✓" if is_best else ""
recon_rows.append(
f"
{_esc(c.scheme)}{marker}
"
f"
{_fmt_bytes(c.predicted_bytes)}
"
f"
{c.relative_error * 100:.1f}%
"
)
recon_explainer = (
"用每种量化方案预测应该有多少字节,跟实际 safetensors 字节对比。误差最小的胜出。"
"FP4_FP8_MIXED / GPTQ_INT4 / AWQ_INT4 在 0.55 bpp 处会打平,需要 config 或 dtype 进一步区分。"
if is_zh
else "Predict bytes under each quantization hypothesis, compare against the real "
"safetensors size. Lowest error wins. FP4_FP8_MIXED / GPTQ_INT4 / AWQ_INT4 tie "
"at 0.55 bpp — broken via config.json or per-tensor dtype."
)
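# Worked example with made-up numbers (not from a real checkpoint): a 7e9-param model
# stored at the 0.55 bytes/param typical of GPTQ/AWQ INT4 predicts 7e9 * 0.55 ≈ 3.85 GB,
# vs 7e9 * 2.0 = 14 GB for FP16. If the safetensors API reports ~3.9 GB, every 0.55-bpp
# hypothesis lands at ~1% error while FP16 misses by ~260%, so the tie between the
# 0.55-bpp schemes is broken by config.json / per-tensor dtypes as described above.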
recon_section = (
f"
"
f"
{'量化反演' if is_zh else 'Quantization reconciliation'}
"
f"
{recon_explainer}
"
f"
"
f"
Scheme
"
f"
{'预测字节' if is_zh else 'Predicted'}
"
f"
{'误差' if is_zh else 'Error'}
"
f"{''.join(recon_rows)}
"
)
# ---- Fleet ---------------------------------------------------------------
fleet_section = ""
if f and f.options:
# Pick which context lengths get their own concurrency column.
# Always include 128K if any option has it; also include the model max
# if it's larger (e.g. 1M for DeepSeek-V4-Flash) so the user can compare
# "fits 23 users at 128K but only 2 at 1M".
all_ctxs: set[int] = set()
for opt in f.options:
for ctx, _ in opt.max_concurrent_by_context:
all_ctxs.add(ctx)
ctx_cols: list[int] = []
if 131_072 in all_ctxs:
ctx_cols.append(131_072)
max_ctx = max(all_ctxs) if all_ctxs else 0
if max_ctx > 131_072 and max_ctx not in ctx_cols:
ctx_cols.append(max_ctx)
if not ctx_cols and all_ctxs:
ctx_cols.append(max_ctx)
def _ctx_label(ctx: int) -> str:
if ctx >= 1_000_000:
return f"{ctx // 1_000_000}M" if ctx % 1_000_000 == 0 else f"{ctx / 1_000_000:.1f}M"
if ctx >= 1024:
return f"{ctx // 1024}K"
return str(ctx)
rows = []
ctx_headers = "".join(f"<th>{_ctx_label(c)}</th>" for c in ctx_cols)
for opt in f.options:
    star = " ★" if opt.tier == f.best_tier else ""
    cls = " class='lc-best'" if opt.tier == f.best_tier else ""
    headroom = max(0, opt.usable_bytes_per_gpu - opt.weight_bytes_per_gpu)
    ctx_map = dict(opt.max_concurrent_by_context)
    ctx_cells = "".join(f"<td>{ctx_map.get(c, '—')}</td>" for c in ctx_cols)
    rows.append(
        f"<tr{cls}><td>{_esc(opt.tier)}{star}</td><td>{opt.gpu_count}</td>"
        f"<td>{_fmt_bytes(opt.weight_bytes_per_gpu)}</td>"
        f"<td>{_fmt_bytes(headroom)}</td>{ctx_cells}</tr>"
    )
fleet_explainer = (
    "min = 刚好放得下权重;dev = 8 路并发 @ 128K;prod = 16 路并发 @ 128K。★ = 推荐。"
    if is_zh
    else "min = barely fits weights; dev = sized for 8 concurrent at 128K; "
    "prod = sized for 16 concurrent at 128K. ★ = recommended."
)
fleet_section = (
f"
"
f"
{'推荐集群' if is_zh else 'Recommended fleet'}
"
f"
{fleet_explainer}
"
f"
"
f"
Tier
GPUs
"
f"
Weight/GPU
Headroom/GPU
"
f"{ctx_headers}
"
f"{''.join(rows)}
"
)
# ---- Performance ---------------------------------------------------------
perf_explainer = (
"Prefill 用算力公式(FLOPs = 2 × 参数 × 输入 token),decode 用带宽公式(吞吐 = 带宽 × 利用率 / 权重字节)。"
"Bottleneck 标 memory_bandwidth 说明 decode 是带宽瓶颈,加显存带宽更高的 GPU 比加算力更划算。"
if is_zh
else "Prefill uses the compute formula (FLOPs = 2 × params × input_tokens, Kaplan 2020). "
"Decode uses memory-bandwidth formula (tps = BW × util / weight_bytes, vLLM paper). "
"Bottleneck = memory_bandwidth means a higher-BW GPU helps more than more FLOPS."
)
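# Worked example with assumed numbers (ballpark only, not from this report): an 8e9-param
# dense FP16 model (16 GB of weights), 2000 input tokens, on a GPU with 989 TFLOPS dense
# FP16 at 40% prefill utilization and 3.35 TB/s HBM at 50% decode utilization:
#   prefill FLOPs   = 2 * 8e9 * 2000            = 3.2e13
#   prefill latency = 3.2e13 / (989e12 * 0.40)  ≈ 0.081 s (≈ 81 ms)
#   decode tok/s    = 3.35e12 * 0.50 / 16e9     ≈ 105 tok/s per GPU (weights re-read each token)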
perf_section = ""
if report.prefill and report.decode and report.concurrency:
max_users = report.concurrency.max_concurrent.value
bn = report.concurrency.bottleneck
items = [
(
"Prefill latency" if not is_zh else "Prefill 延迟",
f"{report.prefill.latency_ms.value:.0f} ms",
f"@ {report.perf_input_tokens or 2000} input tokens",
),
(
"Cluster decode" if not is_zh else "集群 decode 吞吐",
f"{report.decode.cluster_tokens_per_sec.value:.0f} tok/s",
"",
),
(
"Max concurrent users" if not is_zh else "最大并发用户",
str(max_users),
"",
),
(
"Bottleneck" if not is_zh else "瓶颈",
f"{_esc(bn)}",
"",
),
]
items_html = "".join(
f"
"
f"
{v}
"
f"
{_esc(label)}
"
f"
{_esc(sub)}
"
for label, v, sub in items
)
perf_section = (
f"
"
f"
{'性能' if is_zh else 'Performance'}
"
f"
{perf_explainer}
"
f"
{items_html}
"
)
# ---- KV cache per request -----------------------------------------------
kv_section = ""
if report.kv_cache_by_context:
rows = []
for ctx, av in sorted(report.kv_cache_by_context.items()):
rows.append(
f"
{ctx:,}
{_fmt_bytes(av.value)}
"
f"
{_label_chip(av.label.value)}
"
)
kv_explainer = (
"单个请求在不同 context 长度下需要多少 KV 缓存。这是决定一张 GPU 能并发跑多少请求的关键。"
"MLA / MQA 模型这里会比标准 GQA 小很多。"
if is_zh
else "How much KV cache one request consumes at each context length. "
"This is what limits per-GPU concurrency. MLA / MQA models are "
"dramatically smaller here than standard GQA."
)
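# Rough rule of thumb (standard GQA layout, FP16 KV cache; the exact bytes come from the report):
#   kv_bytes_per_token ≈ 2 (K and V) * num_layers * num_kv_heads * head_dim * 2 bytes
# For an assumed 32-layer model with 8 KV heads of dim 128 that is 2*32*8*128*2 ≈ 131 KB/token,
# so one 131,072-token request needs ≈ 17 GB of KV, which is why MLA / MQA (fewer or
# compressed KV heads) shrinks this table so dramatically.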
kv_section = (
f"
"
f"
{'KV 缓存(每请求)' if is_zh else 'KV cache per request'}
"
f"
{kv_explainer}
"
f"
"
f"
{'Context tokens' if not is_zh else 'Context 长度'}
"
f"
{'KV bytes' if not is_zh else 'KV 字节'}
"
f"
{'Label' if not is_zh else '标签'}
"
f"{''.join(rows)}
"
)
# ---- Engine compatibility -----------------------------------------------
engine_section = ""
em = report.engine_match
if em:
def _fmt_flag(f) -> str: # noqa: ANN001
base = f"{f.flag} {f.value}".strip()
return base
flags = ", ".join(_fmt_flag(f) for f in em.required_flags) if em.required_flags else "—"
opt_flags = ", ".join(_fmt_flag(f) for f in em.optional_flags) if em.optional_flags else "—"
caveats = em.caveats_zh if is_zh else em.caveats_en
sources_html = "—"
if em.sources:
sources_html = " ".join(
f'<a href="{_esc(s.url)}" target="_blank">{_esc(s.url)}</a>'
+ (
f" ({_esc(s.captured_date)})"
if s.captured_date
else ""
)
for s in em.sources
)
rows = [
(("引擎" if is_zh else "Engine"), f"{_esc(em.engine)}"),
(
("版本要求" if is_zh else "Version"),
f"{_esc(em.version_spec)}",
),
(
("支持级别" if is_zh else "Support"),
_label_chip(em.support) if em.support in {"verified", "cited", "unverified"} else f"{_esc(em.support)}",
),
(
("验证级别" if is_zh else "Verification"),
_label_chip(em.verification_level),
),
(("必需 flag" if is_zh else "Required flags"), f"{_esc(flags)}"),
(("可选 flag" if is_zh else "Optional flags"), f"{_esc(opt_flags)}"),
]
if caveats:
rows.append((("注意事项" if is_zh else "Caveats"), _esc(caveats)))
rows.append((("来源" if is_zh else "Sources"), sources_html))
body = "".join(f"
{k}
{v}
" for k, v in rows)
engine_explainer = (
"这个模型在 vLLM/SGLang 哪个版本起能跑、需要哪些必需 flag、有哪些优化 flag。"
"verification_level 标 cited 表示从 PR / release note 引用,verified 表示实测过。"
if is_zh
else "Which engine version supports this model, what flags are required, "
"and which optional flags help. verification_level=cited means we got it "
"from a PR or release note; verified means we actually ran it."
)
engine_section = (
f"
"
f"
{'引擎兼容性' if is_zh else 'Engine compatibility'}
"
f"
{engine_explainer}
"
f"
{body}
"
)
# ---- GPU spec ------------------------------------------------------------
gpu_section = ""
g = report.gpu_spec
if g:
notes = g.notes_zh if is_zh else g.notes_en
rows = [
("HBM", f"{g.memory_gb} GB"),
("Memory BW", f"{g.memory_bandwidth_gbps or '—'} GB/s"),
("NVLink BW", f"{g.nvlink_bandwidth_gbps} GB/s"),
("FP16 TFLOPS", f"{g.fp16_tflops}"),
("FP8", "✓" if g.fp8_support else "—"),
("FP4", "✓" if g.fp4_support else "—"),
]
rows_html = "".join(
f"
"
)
def _render_compare(reports: list[EvaluationReport], locale: str) -> str:
"""Side-by-side comparison of N >= 2 reports for the same model on
different GPUs.
Each metric column declares whether higher or lower is better and we
paint the winner cell in green so the eye snaps to it.
"""
set_locale(locale) # type: ignore[arg-type]
is_zh = locale == "zh"
# All reports share the same model_id + engine — pull from the first.
head = reports[0]
title = (
f"
"
)
# Metric definitions: (label_en, label_zh, value_fn, better=lower|higher|info, formatter)
# "info" rows are not contested — used for model-determined facts (same across
# GPUs by construction) or for descriptive cells like Bottleneck.
def _max_concurrent(r: EvaluationReport) -> int | None:
if not r.fleet:
return None
prod = next((o for o in r.fleet.options if o.tier == "prod"), None)
return prod.max_concurrent_at_reference_ctx if prod else None
def _prod_gpu_count(r: EvaluationReport) -> int | None:
if not r.fleet:
return None
prod = next((o for o in r.fleet.options if o.tier == "prod"), None)
return prod.gpu_count if prod else None
def _kv_per_user_128k(r: EvaluationReport) -> int | None:
av = r.kv_cache_by_context.get(131072)
return av.value if av is not None else None
def _native_precision_score(r: EvaluationReport) -> int | None:
g = r.gpu_spec
if g is None:
return None
return (1 if g.fp8_support else 0) + (1 if g.fp4_support else 0)
def _fmt_native(v: int | None) -> str:
if v is None:
return "—"
return {0: "FP16 only", 1: "FP8", 2: "FP8 + FP4"}.get(v, str(v))
def _max_context_tokens(r: EvaluationReport) -> int | None:
"""Effective max context the model claims to support.
In modern HF configs (LLaMA 3+, DeepSeek V3+, Qwen2.5+), the field
max_position_embeddings already reflects the post-RoPE/YaRN-scaling
window. rope_scaling_factor is recorded for provenance but must NOT
be multiplied in again — that double-counts.
"""
pos = r.profile.position
if pos is None or pos.max_position_embeddings is None:
return None
return int(pos.max_position_embeddings)
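# Example of the double-count this guards against (values are the published Llama 3.1
# config, cited from memory, so treat as illustrative): max_position_embeddings = 131072
# with rope_scaling = {"factor": 8.0, ...}. The 131072 already reflects the scaled window;
# multiplying by the recorded factor again would wrongly report roughly 1M of context.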
def _fmt_context(v: int | None) -> str:
"""Binary-base formatting so 131072 reads as '128K' not '131K'."""
if v is None:
return "—"
if v >= 1024 * 1024:
return f"{v / (1024 * 1024):.1f}M".replace(".0M", "M")
if v >= 1024:
return f"{v // 1024}K"
return str(v)
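# Expected mapping under this binary-base convention (usage sketch):
#   _fmt_context(131_072)   -> "128K"
#   _fmt_context(1_048_576) -> "1M"
#   _fmt_context(200_000)   -> "195K"  (floor-divided by 1024, so not "200K")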
def _cluster_qps(r: EvaluationReport) -> float | None:
"""Steady-state queries/sec the cluster sustains:
QPS = cluster_decode_tokens_per_sec / output_tokens_per_request."""
if not r.decode or r.decode.cluster_tokens_per_sec.value <= 0:
return None
out = r.perf_output_tokens or 512
if out <= 0:
return None
return r.decode.cluster_tokens_per_sec.value / out
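# Worked example with assumed numbers: a cluster sustaining 9_600 decode tok/s at the
# default 512 output tokens per request gives 9_600 / 512 = 18.75 q/s of steady-state QPS.
# Prefill time is not included here; this is the decode-bound ceiling.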
metrics = [
# ── Model-determined rows (info; identical across GPUs by definition) ──
("Quantization", "量化方案",
lambda r: r.weight.quantization_guess.value, "info",
lambda v: _esc(str(v)) if v else "—"),
("Weights total", "权重总量",
lambda r: r.weight.total_bytes.value, "info",
lambda v: _fmt_bytes(v) if v else "—"),
("KV / user @ 128K", "KV / 用户 @ 128K",
_kv_per_user_128k, "info",
lambda v: _fmt_bytes(v) if v is not None else "—"),
("Max context", "最大上下文",
_max_context_tokens, "info",
_fmt_context),
# ── GPU hardware specs (contested) ──
("HBM / card", "单卡显存",
lambda r: r.gpu_spec.memory_gb if r.gpu_spec else None, "higher",
lambda v: f"{v} GB" if v is not None else "—"),
("HBM bandwidth", "显存带宽",
lambda r: r.gpu_spec.memory_bandwidth_gbps if r.gpu_spec else None, "higher",
lambda v: f"{v:,} GB/s" if v is not None else "—"),
("NVLink / card", "NVLink 带宽",
lambda r: r.gpu_spec.nvlink_bandwidth_gbps if r.gpu_spec else None, "higher",
lambda v: (f"{v} GB/s" if v else "无") if v is not None else "—"),
("Native FP8/FP4", "原生低精度",
_native_precision_score, "higher",
_fmt_native),
# ── Sizing & performance outcomes (contested) ──
("Prod GPUs", "生产档 GPU 数",
_prod_gpu_count, "lower",
lambda v: str(v) if v is not None else "—"),
("Users @ 128K", "用户 @ 128K",
_max_concurrent, "higher",
lambda v: str(v) if v is not None else "—"),
("Prefill latency", "Prefill 延迟",
lambda r: r.prefill.latency_ms.value if r.prefill else None, "lower",
lambda v: f"{v:.0f} ms" if v is not None else "—"),
("Per-GPU decode", "单卡 decode 吞吐",
lambda r: r.decode.per_gpu_tokens_per_sec.value if r.decode else None, "higher",
lambda v: f"{v:.0f} tok/s" if v is not None else "—"),
("Cluster decode", "集群 decode 吞吐",
lambda r: r.decode.cluster_tokens_per_sec.value if r.decode else None, "higher",
lambda v: f"{v:.0f} tok/s" if v is not None else "—"),
("Sustained QPS", "稳态 QPS",
_cluster_qps, "higher",
lambda v: f"{v:.2f} q/s" if v is not None else "—"),
# ── Diagnostic (info — string, not a number race) ──
("Bottleneck", "瓶颈",
lambda r: r.concurrency.bottleneck if r.concurrency else None, "info",
lambda v: f"{_esc(str(v))}" if v else "—"),
]
# GPU column headers
gpu_headers = "".join(
f"
{_esc(r.gpu)}
" for r in reports
)
rows_html = []
for label_en, label_zh, getter, better, fmt in metrics:
values = [getter(r) for r in reports]
# Pick the winning index. None values are excluded from the contest.
winner_idx: int | None = None
if better in ("higher", "lower"):
numeric_pairs = [(i, v) for i, v in enumerate(values) if isinstance(v, (int, float))]
if numeric_pairs:
if better == "higher":
winner_idx = max(numeric_pairs, key=lambda p: p[1])[0]
else:
winner_idx = min(numeric_pairs, key=lambda p: p[1])[0]
# If all values are equal, no winner (avoid arbitrary-tiebreak gold star)
vals_set = {v for _, v in numeric_pairs}
if len(vals_set) <= 1:
winner_idx = None
cells = []
for i, v in enumerate(values):
cls = " class='lc-cmp-winner'" if i == winner_idx else ""
cells.append(f"
{fmt(v)}
")
label = label_zh if is_zh else label_en
# Tag info rows so the eye knows "this is a model fact, not a contest".
is_info = better == "info"
label_cls = "lc-cmp-row-label lc-cmp-row-info" if is_info else "lc-cmp-row-label"
tr_cls = " class='lc-cmp-tr-info'" if is_info else ""
rows_html.append(
f"
{_esc(label)}
{''.join(cells)}
"
)
# Aggregate winner — count column wins across "higher/lower" metrics
win_counts = [0] * len(reports)
for label_en, label_zh, getter, better, fmt in metrics:
if better == "info":
continue
values = [getter(r) for r in reports]
numeric_pairs = [(i, v) for i, v in enumerate(values) if isinstance(v, (int, float))]
if not numeric_pairs:
continue
vals_set = {v for _, v in numeric_pairs}
if len(vals_set) <= 1:
continue
if better == "higher":
winner_idx = max(numeric_pairs, key=lambda p: p[1])[0]
else:
winner_idx = min(numeric_pairs, key=lambda p: p[1])[0]
win_counts[winner_idx] += 1
overall_text = ""
if any(win_counts):
max_wins = max(win_counts)
leaders = [reports[i].gpu for i, c in enumerate(win_counts) if c == max_wins]
if len(leaders) == 1:
overall_text = (
f"
"
f"{'综合最优' if is_zh else 'Overall winner'}: "
f"{_esc(leaders[0])} "
f"({max_wins}/{sum(1 for m in metrics if m[3] != 'info')} "
f"{'指标领先' if is_zh else 'metrics lead'})"
f"
"
)
def _render_star_cta(is_zh: bool) -> str:
"""Tail-of-result CTA — shown right after the user got their answer,
which is when satisfaction is highest and the GitHub star ask reads as
'thanks for the tool' rather than 'please give me attention'."""
en_msg = "Saved you GPU-sizing math?"
zh_msg = "省了你 GPU 选型的时间?"
cta_en = "Star on GitHub"
cta_zh = "给个 Star"
text_top = zh_msg if is_zh else en_msg
text_bottom = en_msg if is_zh else zh_msg
cta = f"{cta_zh if is_zh else cta_en} · {cta_en if is_zh else cta_zh}"
return (
""
""
f"
"
""
)
def _render_explain(entries: list[ExplainEntry], is_zh: bool) -> str:
"""Render --explain derivation trace as an HTML accordion."""
if not entries:
return ""
blocks = []
for e in entries:
inputs_html = ""
if e.inputs:
inputs_html = "
"
f"{'结果' if is_zh else 'Result'}: {_esc(e.result)}
"
f"{source_html}"
f"
"
)
return (
"
"
f"
"
f"
{'推导链 (--explain)' if is_zh else 'Derivation trace (--explain)'}
"
+ "".join(blocks)
+ "
"
)
def _render_llm_review(content: str | None, error: str | None, model: str, is_zh: bool) -> str:
if error:
return _render_error(f"LLM review: {error}", is_zh)
if not content:
return ""
# The LLM responds with markdown — convert to a simple HTML block for display.
# gr.HTML doesn't run markdown, but the LLM's headers (## ...) still read OK as text.
safe = _esc(content).replace("\n", "<br>")
return (
    f"<div class='lc-llm-review'>"
    f"<div class='lc-llm-model'>{_esc(model)}</div>"
    f"<div class='lc-llm-body'>{safe}</div></div>"
)
def _render_loading(is_zh: bool) -> str:
msg = (
"正在拉取模型元数据 + 读 safetensors header… 首次大模型约 3-8 秒"
if is_zh
else "Fetching model metadata + reading safetensors header… "
"first lookup of a large model takes 3-8 seconds"
)
return (
"
"
""
f"
{msg}
"
"
"
)
# ---------------------------------------------------------------------------
# Backend handler
_evaluators: dict[str, Evaluator] = {}
def _get_evaluator(source_key: str) -> Evaluator:
"""One evaluator per source — Evaluator caches an HfApi client internally
so we don't want to rebuild it every keystroke."""
if source_key not in _evaluators:
if source_key == "modelscope":
_evaluators[source_key] = Evaluator(source=ModelScopeSource())
else:
_evaluators[source_key] = Evaluator(source=HuggingFaceSource())
return _evaluators[source_key]
def calculate(
model_id: str,
gpu, # list[str] from multiselect; str also tolerated # noqa: ANN001
engine: str,
context_length: int | None,
lang: str,
source: str,
gpu_count: int | None,
input_tokens: int,
output_tokens: int,
target_tps: float,
prefill_util: float,
decode_bw_util: float,
concurrency_degradation: float,
refresh: bool,
explain: bool,
llm_review: bool,
hf_token: str,
ms_token: str,
llm_api_key: str,
llm_base_url: str,
llm_model: str,
) -> tuple[str, str, str]:
"""Returns (main_html, explain_html, llm_review_html)."""
locale = "zh" if lang.startswith("中") else "en"
is_zh = locale == "zh"
# Normalize GPU input. Multiselect returns list; defensive coerce for safety.
if isinstance(gpu, str):
gpu_list = [gpu] if gpu else []
elif isinstance(gpu, (list, tuple)):
gpu_list = [g for g in gpu if g]
else:
gpu_list = []
if not model_id or not model_id.strip():
return (
_render_error(
"请输入模型 ID" if is_zh else "Enter a model id",
is_zh,
),
"",
"",
)
if not gpu_list:
return (_render_error("请选择 GPU" if is_zh else "Pick a GPU", is_zh), "", "")
is_compare = len(gpu_list) >= 2
# Resolve source key. The radio shows e.g. "HuggingFace" / "ModelScope".
src_key = "modelscope" if "modelscope" in source.lower() else "huggingface"
# Inject user-provided tokens into env for the duration of this call only.
# We restore the prior values in the finally block so a token entered for
# one model doesn't leak into the next request from a different user.
token_env_keys = (
"HF_TOKEN",
"HUGGING_FACE_HUB_TOKEN",
"MODELSCOPE_API_TOKEN",
"MODELSCOPE_TOKEN",
)
old_token_env = {k: os.environ.get(k) for k in token_env_keys}
if hf_token and hf_token.strip():
os.environ["HF_TOKEN"] = hf_token.strip()
if ms_token and ms_token.strip():
os.environ["MODELSCOPE_API_TOKEN"] = ms_token.strip()
def _eval_one(g: str) -> EvaluationReport:
return _get_evaluator(src_key).evaluate(
model_id=model_id.strip(),
gpu=g,
engine=engine,
gpu_count=gpu_count if gpu_count and gpu_count > 0 else None,
context_length=context_length if context_length and context_length > 0 else None,
refresh=refresh,
input_tokens=int(input_tokens) if input_tokens else 2000,
output_tokens=int(output_tokens) if output_tokens else 512,
target_tokens_per_sec=float(target_tps) if target_tps else 30.0,
prefill_utilization=float(prefill_util) if prefill_util else 0.40,
decode_bw_utilization=float(decode_bw_util) if decode_bw_util else 0.50,
concurrency_degradation=(
float(concurrency_degradation) if concurrency_degradation else 1.0
),
)
try:
# ---- Compare path: 2-4 GPUs --------------------------------------
if is_compare:
try:
reports = [_eval_one(g) for g in gpu_list]
except Exception as e: # noqa: BLE001
return (_render_error(f"{type(e).__name__}: {e}", is_zh), "", "")
return _render_compare(reports, locale), "", ""
# ---- Single-GPU path (existing flow) ------------------------------
try:
report = _eval_one(gpu_list[0])
except Exception as e: # noqa: BLE001
return (_render_error(f"{type(e).__name__}: {e}", is_zh), "", "")
main_html = _render(report, locale)
explain_html = ""
llm_html = ""
if explain or llm_review:
entries = build_explain(report)
if explain:
explain_html = _render_explain(entries, is_zh)
if llm_review:
# Only set env vars if user actually provided them — never persist
# them in env beyond this call's scope (they live in process env
# for the duration of the call, but we don't persist to disk).
old_env = {
"LLM_CAL_REVIEWER_API_KEY": os.environ.get("LLM_CAL_REVIEWER_API_KEY"),
"LLM_CAL_REVIEWER_BASE_URL": os.environ.get("LLM_CAL_REVIEWER_BASE_URL"),
"LLM_CAL_REVIEWER_MODEL": os.environ.get("LLM_CAL_REVIEWER_MODEL"),
}
try:
if llm_api_key.strip():
os.environ["LLM_CAL_REVIEWER_API_KEY"] = llm_api_key.strip()
if llm_base_url.strip():
os.environ["LLM_CAL_REVIEWER_BASE_URL"] = llm_base_url.strip()
if llm_model.strip():
os.environ["LLM_CAL_REVIEWER_MODEL"] = llm_model.strip()
result = run_review(entries, locale=locale) # type: ignore[arg-type]
finally:
for k, v in old_env.items():
if v is None:
os.environ.pop(k, None)
else:
os.environ[k] = v
llm_html = _render_llm_review(result.content, result.error, result.model, is_zh)
return main_html, explain_html, llm_html
finally:
for k, v in old_token_env.items():
if v is None:
os.environ.pop(k, None)
else:
os.environ[k] = v
def show_loading(lang: str) -> tuple[str, str, str]:
is_zh = lang.startswith("中")
return _render_loading(is_zh), "", ""
# ---------------------------------------------------------------------------
# UI
THEME = gr.themes.Soft(primary_hue="indigo")
HERO_HTML = """