"""Full derivation traces for each non-trivial number in the report.
This module is only invoked when the user passes `--explain`. It doesn't
recompute anything — it reads the values that the main evaluator already
produced and wraps them in a formatted explanation with formula, inputs,
step-by-step computation, and primary source citation.
Design rationale: the tool's core promise is deterministic, auditable
output. `--explain` makes that auditability human-readable. A user can:
1. Read the explanation themselves
2. Paste it into an LLM and ask "does this math check out?"
3. Cross-reference docs/methodology.md for the primary source
All three preserve determinism — the LLM is the user's tool, not ours.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from llm_cal.core.evaluator import EvaluationReport
@dataclass(frozen=True)
class ExplainInput:
"""One input variable to a formula."""
name: str
value: str # pre-formatted for display
label: str # e.g. "[verified]", "[estimated]"
note: str = "" # optional disambiguation
@dataclass(frozen=True)
class ExplainEntry:
"""A full derivation trace for one output number."""
heading: str # localized section title, e.g. "KV cache @ 128K"
formula: str # the formula, literally
inputs: list[ExplainInput] = field(default_factory=list)
steps: list[str] = field(default_factory=list) # step-by-step computation
result: str = "" # final formatted answer with label
source: str = "" # primary source citation
methodology_anchor: str = "" # anchor in docs/methodology.md, e.g. "#prefill-latency"
def build(report: EvaluationReport) -> list[ExplainEntry]:
"""Produce explanation entries in the order they appear in the main report."""
entries: list[ExplainEntry] = []
_weight_bytes(report, entries)
_quantization(report, entries)
_kv_cache_contexts(report, entries)
_fleet_tiers(report, entries)
_prefill(report, entries)
_decode(report, entries)
_concurrency(report, entries)
return entries
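
# Illustrative use of build() (a sketch only; the real CLI wiring lives
# elsewhere in the package, and `evaluate_model` below is a hypothetical
# stand-in for whatever produces the EvaluationReport):
#
#     report = evaluate_model("some-org/some-model")   # -> EvaluationReport
#     for entry in build(report):
#         print(entry.heading)
#         print("  formula:", entry.formula)
#         print("  result: ", entry.result)
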
# ======================================================================
# Weight
# ======================================================================
def _weight_bytes(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
w = report.weight.total_bytes
entries.append(
ExplainEntry(
heading="Weight bytes (safetensors file sum)",
            formula="sum(s.size for s in HF model_info(files_metadata=True).siblings if s.rfilename.endswith('.safetensors'))",
inputs=[
ExplainInput(
name="HF model_info API",
value=f"source={report.source}, sha={report.commit_sha or 'HEAD'}",
label="[verified]",
),
],
steps=[
f"Raw value from API = {w.value:,} bytes",
f"= {w.value / 1e9:.2f} GB",
],
result=f"{w.value:,} bytes [verified]",
source=w.source or "HF siblings API",
methodology_anchor="#weight-bytes",
)
)
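
# Worked example for the sum above (hypothetical shard sizes, not from any
# real repo): three safetensors shards of 9.5 GB, 9.5 GB and 2.1 GB would
# report 21_100_000_000 bytes (21.10 GB) as the verified weight total.
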
def _quantization(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
r = report.reconciliation
if not r.candidates:
return
best = r.candidates[0]
cands_table = "\n".join(
f" {c.scheme:<16} predicted={c.predicted_bytes / 1e9:.2f} GB "
f"error={c.relative_error * 100:.1f}%"
for c in r.candidates[:6]
)
entries.append(
ExplainEntry(
heading="Quantization scheme (reconciliation)",
formula="best_match = argmin_scheme |observed_bytes - scheme.bpp × total_params|",
inputs=[
ExplainInput(
name="observed_bytes",
value=f"{r.observed_bytes:,}",
label="[verified]",
),
ExplainInput(
name="total_params",
value=f"{r.total_params:,}",
label="[estimated]",
note="from architecture formula — see '#params-estimate' entry below",
),
],
steps=[
"For each known quantization scheme, predict total bytes = bpp × params:",
cands_table,
f"Winner: {best.scheme} at {best.relative_error * 100:.1f}% error",
],
result=f"{r.best.value} [{r.best.label.value}]",
source="Nearest-anchor match against known bytes-per-param values",
methodology_anchor="#quantization-scheme",
)
)
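
# Worked example for the reconciliation above (hypothetical numbers): with
# total_params ≈ 685e9 and observed_bytes ≈ 689e9, an FP8 scheme at
# 1.0 bytes-per-param predicts 685 GB (≈ 0.6% relative error), while a BF16
# scheme at 2.0 bytes-per-param predicts 1370 GB (≈ 99% error), so FP8 wins
# the argmin.
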
# ======================================================================
# KV cache
# ======================================================================
def _kv_cache_contexts(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
profile = report.profile
attn = profile.attention
if attn is None:
return
is_mla = attn.variant == "MLA"
is_csa_hca = attn.variant == "CSA_HCA"
for ctx, av in report.kv_cache_by_context.items():
if av.value == 0:
continue
# Rebuild the computation for transparency
if is_mla and attn.kv_lora_rank:
per_tok_per_layer = attn.kv_lora_rank * 2 # kv_lora_rank × dtype(2)
formula = "per_tok_per_layer = kv_lora_rank × dtype_bytes (MLA: compressed latent KV)"
inputs = [
ExplainInput("kv_lora_rank", str(attn.kv_lora_rank), "[verified]"),
ExplainInput("dtype_bytes", "2", "[verified]", note="BF16/FP16"),
ExplainInput("seq_len", f"{ctx:,}", "[verified]"),
ExplainInput("num_layers", str(profile.num_hidden_layers), "[verified]"),
]
else:
            per_tok_per_layer = 2 * attn.num_kv_heads * attn.head_dim * 2  # (K+V) × heads × head_dim × dtype(2)
formula = "per_tok_per_layer = 2 × num_kv_heads × head_dim × dtype_bytes (standard attention)"
inputs = [
ExplainInput("num_kv_heads", str(attn.num_kv_heads), "[verified]"),
ExplainInput("head_dim", str(attn.head_dim), "[verified]"),
ExplainInput("dtype_bytes", "2", "[verified]", note="BF16/FP16"),
ExplainInput("seq_len", f"{ctx:,}", "[verified]"),
ExplainInput("num_layers", str(profile.num_hidden_layers), "[verified]"),
]
baseline = per_tok_per_layer * ctx * profile.num_hidden_layers
steps = [
f"per_tok_per_layer = {per_tok_per_layer:,} bytes",
f"baseline = per_tok_per_layer × seq_len × num_layers = {baseline:,} bytes",
]
if is_csa_hca and attn.compress_ratios:
ratios = attn.compress_ratios
avg = sum(1.0 if r == 0 else 1.0 / r for r in ratios) / len(ratios)
inputs.append(
ExplainInput(
"compress_ratios",
f"len={len(ratios)} (avg keep-fraction={avg:.4f})",
"[verified]",
)
)
            formula += (
                "\napply_csa_hca: baseline × avg(1/r_i for r_i in compress_ratios; "
                "r_i = 0 means keep-all, i.e. keep-fraction 1.0)"
            )
steps.extend(
[
f"avg_keep_fraction = {avg:.4f}",
f"result = baseline × avg_keep_fraction = {av.value:,} bytes",
]
)
else:
steps.append(f"result = baseline = {av.value:,} bytes")
entries.append(
ExplainEntry(
heading=f"KV cache @ {_fmt_ctx(ctx)} context",
formula=formula,
inputs=inputs,
steps=steps,
result=f"{av.value:,} bytes = {av.value / 1e9:.2f} GB [{av.label.value}]",
source=(
"DeepSeek-V2 paper (MLA); DeepSeek-V4 tech report (CSA+HCA); "
"standard attention formula per Attention Is All You Need (Vaswani 2017)"
),
methodology_anchor="#kv-cache-per-request",
)
)
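
# Worked examples for the two formulas above (hypothetical configs, BF16 KV,
# 61 layers, 131072-token context):
#   standard attention, num_kv_heads=8, head_dim=128:
#     per_tok_per_layer = 2 × 8 × 128 × 2 = 4096 bytes
#     per request       = 4096 × 131072 × 61 ≈ 32.7 GB
#   MLA (as modelled here, compressed latent only), kv_lora_rank=512:
#     per_tok_per_layer = 512 × 2 = 1024 bytes
#     per request       = 1024 × 131072 × 61 ≈ 8.2 GB
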
# ======================================================================
# Fleet tiers
# ======================================================================
def _fleet_tiers(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
if report.fleet is None or report.gpu_spec is None:
return
# One explain block per tier (min / dev / prod)
for opt in report.fleet.options:
tier_label = opt.tier
headroom = opt.usable_bytes_per_gpu - opt.weight_bytes_per_gpu
steps = [
f"per-GPU HBM usable (@ 90% util) = {opt.usable_bytes_per_gpu:,} bytes",
f"weight per GPU = total_weight / TP_size = "
f"{report.weight.total_bytes.value:,} / {opt.gpu_count} = "
f"{opt.weight_bytes_per_gpu:,} bytes",
f"headroom per GPU = usable - weight = {headroom:,} bytes ({headroom / 1e9:.2f} GB)",
]
fit_criterion = {"min": 1, "dev": 8, "prod": 16}.get(tier_label, 1)
        steps.append(
            f"tier criterion: headroom ≥ {fit_criterion} × kv_per_request_128K "
            f"(i.e. weight_per_gpu + {fit_criterion} × kv_per_request_128K ≤ usable_per_gpu)"
        )
steps.append(
f"smallest TP count in {list(report.fleet.valid_tp_sizes)} that "
f"satisfies the criterion: {opt.gpu_count}"
)
if not opt.fits:
            steps.append(
                f"NOTE: does not satisfy the criterion; {opt.gpu_count} GPUs "
                "is the closest available option."
            )
entries.append(
ExplainEntry(
heading=f"Fleet tier: {tier_label} ({opt.gpu_count} GPUs)",
formula=(
"smallest TP in valid_set where "
"weight_per_gpu + concurrent × kv_per_request ≤ usable_per_gpu"
),
inputs=[
ExplainInput(
"total_weight_bytes",
f"{report.weight.total_bytes.value:,}",
"[verified]",
),
ExplainInput(
"valid_TP_sizes",
str(list(report.fleet.valid_tp_sizes)),
"[estimated]",
note="divisors of num_attention_heads capped at 8 (single node)",
),
ExplainInput(
"GPU memory_gb",
f"{report.gpu_spec.memory_gb} GB",
"[verified]",
),
],
steps=steps,
result=f"{opt.gpu_count} GPUs, fit={opt.fits}",
source="vLLM --gpu-memory-utilization 0.9 convention; TP divisibility required by vLLM/SGLang",
methodology_anchor="#tp-aware-kv-sharding",
)
)
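
# Worked example for the tier criterion above (hypothetical GPU and model):
# with 100 GB usable per GPU, 60 GB of weights per GPU (headroom 40 GB) and a
# 4 GB KV cache per 128K-token request:
#   min  tier needs  1 × 4 GB =  4 GB ≤ 40 GB  -> fits
#   dev  tier needs  8 × 4 GB = 32 GB ≤ 40 GB  -> fits
#   prod tier needs 16 × 4 GB = 64 GB > 40 GB  -> move to the next valid TP size
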
# ======================================================================
# Prefill
# ======================================================================
def _prefill(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
if (
report.prefill is None
or report.gpu_spec is None
or report.fleet is None
or report.perf_input_tokens is None
):
return
p = report.prefill
# Figure out chosen GPU count from the fleet
chosen = next(
(o.gpu_count for o in report.fleet.options if o.tier == report.fleet.best_tier),
report.fleet.options[0].gpu_count,
)
entries.append(
ExplainEntry(
heading="Prefill latency (single request)",
formula=(
"FLOPs = 2 × params × input_tokens\n"
"effective_TFLOPS = peak_fp16_TFLOPS × num_gpus × utilization\n"
"latency_ms = (FLOPs / (effective_TFLOPS × 1e12)) × 1000"
),
inputs=[
ExplainInput(
"params",
f"{report.total_params_estimate.value:,}",
"[estimated]",
note="from architecture formula (see weight.py)",
),
ExplainInput("input_tokens", f"{report.perf_input_tokens:,}", "[user-set]"),
ExplainInput(
"peak_fp16_TFLOPS",
f"{report.gpu_spec.fp16_tflops}",
"[verified]",
note=f"from GPU database, {report.gpu_spec.id} spec",
),
ExplainInput("num_gpus", f"{chosen}", "[estimated]"),
ExplainInput(
"utilization",
f"{p.utilization:.2f}",
"[user-set]",
note="empirical MFU, default 0.40 — override with --prefill-util",
),
],
steps=[
f"FLOPs = 2 × {report.total_params_estimate.value:,} × "
f"{report.perf_input_tokens:,} = {p.total_flops.value:.3e}",
f"effective_TFLOPS = {report.gpu_spec.fp16_tflops} × {chosen} × "
f"{p.utilization:.2f} = {p.peak_effective_tflops.value:.1f}",
f"latency = {p.total_flops.value:.3e} / "
f"({p.peak_effective_tflops.value:.1f} × 1e12) × 1000 = "
f"{p.latency_ms.value:.1f} ms",
],
result=f"{p.latency_ms.value:.1f} ms [{p.latency_ms.label.value}]",
source="Kaplan et al. 2020 'Scaling Laws for Neural Language Models' (arxiv.org/abs/2001.08361)",
methodology_anchor="#prefill-latency",
)
)
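
# Worked example for the prefill estimate above (hypothetical numbers): a
# 70e9-parameter model with an 8192-token prompt needs
# 2 × 70e9 × 8192 ≈ 1.15e15 FLOPs; on 4 GPUs at 1000 peak TFLOPS each and
# 0.40 utilization the effective rate is 1600 TFLOPS, giving roughly
# 1.15e15 / 1.6e15 ≈ 0.72 s ≈ 720 ms.
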
# ======================================================================
# Decode
# ======================================================================
def _decode(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
if report.decode is None or report.gpu_spec is None or report.fleet is None:
return
d = report.decode
bw = report.gpu_spec.memory_bandwidth_gbps or 0
chosen = next(
(o.gpu_count for o in report.fleet.options if o.tier == report.fleet.best_tier),
report.fleet.options[0].gpu_count,
)
weight_per_gpu = d.active_weight_bytes_per_gpu.value
effective_bw_gbs = bw * d.bw_utilization
steps = [
f"weight_per_gpu = {report.weight.total_bytes.value:,} / {chosen} = "
f"{weight_per_gpu:,} bytes ({weight_per_gpu / 1e9:.2f} GB)",
f"effective_bw = {bw} × {d.bw_utilization:.2f} = {effective_bw_gbs:.0f} GB/s",
f"per_gpu_tok_per_sec = effective_bw / weight_per_gpu = "
f"{effective_bw_gbs * 1e9 / weight_per_gpu:.1f} tok/s",
f"cluster_tok_per_sec = per_gpu × {chosen} × "
f"{d.cluster_comm_efficiency:.2f} = {d.cluster_tokens_per_sec.value:.1f} tok/s",
]
entries.append(
ExplainEntry(
heading="Decode throughput (cluster)",
formula=(
"per_gpu_tok_per_sec = memory_bandwidth × bw_util / weight_bytes_per_gpu\n"
"cluster_tok_per_sec = per_gpu × num_gpus × cluster_comm_efficiency"
),
inputs=[
ExplainInput(
"GPU memory_bandwidth_gbps",
f"{bw}",
"[verified]",
note=f"from GPU database, {report.gpu_spec.id}",
),
ExplainInput(
"bw_util",
f"{d.bw_utilization:.2f}",
"[user-set]",
note="empirical, default 0.50 — override with --decode-bw-util",
),
ExplainInput("weight_bytes_per_gpu", f"{weight_per_gpu:,}", "[estimated]"),
ExplainInput("num_gpus", f"{chosen}", "[estimated]"),
ExplainInput(
"cluster_comm_efficiency",
f"{d.cluster_comm_efficiency:.2f}",
"[user-set]",
note="NCCL AllReduce efficiency on NVLink, default 0.90",
),
],
steps=steps,
result=f"{d.cluster_tokens_per_sec.value:.1f} tok/s [estimated]",
source="vLLM paper (Kwon et al. SOSP 2023, arxiv.org/abs/2309.06180)",
methodology_anchor="#decode-tokens-per-second",
)
)
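
# Worked example for the decode estimate above (hypothetical numbers): with
# 3000 GB/s of HBM bandwidth at 0.50 utilization (1500 GB/s effective) and
# 35 GB of weights per GPU, each GPU streams the weights ≈ 42.9 times per
# second; 4 GPUs at 0.90 communication efficiency give ≈ 154 tok/s.
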
# ======================================================================
# Concurrency bounds
# ======================================================================
def _concurrency(report: EvaluationReport, entries: list[ExplainEntry]) -> None:
if report.concurrency is None:
return
c = report.concurrency
entries.append(
ExplainEntry(
heading="K bound (memory capacity)",
formula="K = floor(per_GPU_headroom_bytes / per_GPU_kv_bytes_per_request)",
inputs=[
ExplainInput(
"per_GPU_headroom_bytes",
f"{c.k_source_headroom_bytes:,}",
"[estimated]",
),
ExplainInput(
"per_GPU_kv_bytes_per_request",
f"{c.k_source_kv_per_req_bytes:,}",
"[estimated]",
note="post-TP-sharding via min(tp, num_kv_heads)",
),
],
steps=[
f"K = floor({c.k_source_headroom_bytes:,} / "
f"{c.k_source_kv_per_req_bytes:,}) = {c.k_bound.value}",
],
result=f"K = {c.k_bound.value} [{c.k_bound.label.value}]",
source="TP sharding rule from vLLM source code (verified)",
methodology_anchor="#k-bound-memory-capacity",
)
)
l_tps = report.decode.cluster_tokens_per_sec.value if report.decode else 0
entries.append(
ExplainEntry(
heading="L bound (compute/bandwidth at SLA)",
formula=(
"L = floor(cluster_tok_per_sec / target_per_user_tok_per_sec / degradation_factor)"
),
inputs=[
ExplainInput("cluster_tok_per_sec", f"{l_tps:.1f}", "[estimated]"),
ExplainInput(
"target_per_user_tok_per_sec",
f"{c.target_tokens_per_sec:.1f}",
"[user-set]",
note="SLA, override with --target-tokens-per-sec",
),
ExplainInput(
"degradation_factor",
f"{c.degradation_factor:.2f}",
"[user-set]",
note="default 1.0 = no degradation; override with --concurrency-degradation",
),
],
steps=[
f"L = floor({l_tps:.1f} / {c.target_tokens_per_sec:.1f} / "
f"{c.degradation_factor:.2f}) = {c.l_bound.value}",
],
result=f"L = {c.l_bound.value} [{c.l_bound.label.value}]",
source="Standard SLA-based capacity planning",
methodology_anchor="#l-bound-compute-bandwidth-at-sla",
)
)
entries.append(
ExplainEntry(
heading="Max concurrent + bottleneck verdict",
formula="max_concurrent = min(K, L); bottleneck = 'memory_capacity' if K ≤ L else 'memory_bandwidth / compute'",
inputs=[
ExplainInput("K", str(c.k_bound.value), f"[{c.k_bound.label.value}]"),
ExplainInput("L", str(c.l_bound.value), f"[{c.l_bound.label.value}]"),
],
steps=[
f"max_concurrent = min(K={c.k_bound.value}, L={c.l_bound.value}) = "
f"{c.max_concurrent.value}",
f"bottleneck = {c.bottleneck}",
],
result=(f"{c.max_concurrent.value} concurrent, bottleneck = {c.bottleneck}"),
source=c.bottleneck_reason_en,
methodology_anchor="#concurrency-bounds-k-l",
)
)
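
# Worked example tying the three entries above together (hypothetical numbers,
# consistent with the decode example): 40 GB of per-GPU headroom and 4 GB of
# per-GPU KV per request give K = floor(40 / 4) = 10; 154 cluster tok/s at a
# 20 tok/s per-user SLA and degradation factor 1.0 give L = floor(154 / 20) = 7;
# max_concurrent = min(10, 7) = 7, so the bottleneck is bandwidth/compute.
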
# ======================================================================
# Helpers
# ======================================================================
def _fmt_ctx(ctx: int) -> str:
    """Format a context length for headings, e.g. 131072 -> "128K", 1048576 -> "1M"."""
    if ctx >= 1024 * 1024:
        return f"{ctx // (1024 * 1024)}M"
    if ctx >= 1024:
        return f"{ctx // 1024}K"
    return str(ctx)