"""CLI entry point. Thin shell over `Evaluator` + rich formatter."""
from __future__ import annotations
import sys
import typer
from rich.console import Console
from llm_cal.benchmark.runner import exit_code_from, render_results, run_all
from llm_cal.common.i18n import detect_locale_from_env, get_locale, set_locale, t
from llm_cal.core.evaluator import Evaluator
from llm_cal.core.explain import build as build_explain
from llm_cal.hardware.loader import load_database
from llm_cal.llm_review.reviewer import run_review
from llm_cal.model_source.base import (
AuthRequiredError,
ModelNotFoundError,
ModelSource,
SourceUnavailableError,
)
from llm_cal.model_source.huggingface import HuggingFaceSource
from llm_cal.model_source.modelscope import ModelScopeSource
from llm_cal.output.formatter import (
render,
render_explain,
render_gpu_list,
render_llm_review,
)
# Set locale from env first; --lang flag can override inside main()
set_locale(detect_locale_from_env())
app = typer.Typer(
name="llm-cal",
help="LLM inference hardware calculator.",
no_args_is_help=True,
)
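# Reports render to stdout; errors go to a separate stderr console so that
# piped stdout stays clean.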
_console = Console()
_err = Console(stderr=True)
@app.command()
def main(
model_id: str | None = typer.Argument(None, help="HuggingFace or ModelScope model id"),
gpu: str | None = typer.Option(None, "--gpu", help="GPU type, e.g. H800, A100-80G"),
engine: str = typer.Option("vllm", "--engine", help="Inference engine: vllm | sglang"),
    gpu_count: int | None = typer.Option(
        None, "--gpu-count", help="Force the GPU count (otherwise the tool recommends one)"
    ),
context_length: int | None = typer.Option(
None, "--context-length", help="Context length for KV cache estimation"
),
refresh: bool = typer.Option(False, "--refresh", help="Bypass cache and re-fetch"),
lang: str | None = typer.Option(
None,
"--lang",
help="Output language: en | zh (default auto-detects from LANG env)",
),
list_gpus: bool = typer.Option(
False,
"--list-gpus",
help="List all supported GPUs and exit (no model_id needed)",
),
benchmark: bool = typer.Option(
False,
"--benchmark",
help=(
"Run the curated benchmark dataset: compare tool output against "
"reference values from HF API, model cards, vLLM recipes. "
"Requires network. Exit 0 on all-pass, 1 if any FAIL."
),
),
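    # Workload shape: token budgets plus the per-user decode SLA.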
input_tokens: int = typer.Option(
2000,
"--input-tokens",
help="Input token budget for prefill-latency estimation (default: 2000).",
),
output_tokens: int = typer.Option(
512,
"--output-tokens",
help="Output token budget for total-latency math (default: 512).",
),
target_tokens_per_sec: float = typer.Option(
30.0,
"--target-tokens-per-sec",
help="SLA: per-user decode tokens/second (drives L bound). Default: 30.",
),
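    # Empirical utilization knobs for the performance model.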
prefill_util: float = typer.Option(
0.40,
"--prefill-util",
help="Compute utilization factor for prefill (empirical, default 0.40).",
),
decode_bw_util: float = typer.Option(
0.50,
"--decode-bw-util",
help="Memory-bandwidth utilization factor for decode (default 0.50).",
),
concurrency_degradation: float = typer.Option(
1.0,
"--concurrency-degradation",
help=(
"High-concurrency throughput degradation factor (default 1.0 = "
"no degradation — the honest baseline). If your engine drops "
"to 60% efficiency under load, pass 1.67. See docs/methodology.md."
),
),
explain: bool = typer.Option(
False,
"--explain",
help=(
"Print the full derivation trace (formula, inputs, step-by-step, "
"source) for every non-trivial number. Feed the output to an LLM "
"if you want a second opinion on the math."
),
),
llm_review: bool = typer.Option(
False,
"--llm-review",
help=(
"EXPERIMENTAL: send the derivation trace to an LLM for a second "
"opinion. Output is tagged [llm-opinion] and never overrides the "
"6 primary labels. Requires env vars: LLM_CAL_REVIEWER_API_KEY "
"(required), LLM_CAL_REVIEWER_BASE_URL (default OpenAI), "
"LLM_CAL_REVIEWER_MODEL (default gpt-4o)."
),
),
source: str = typer.Option(
"huggingface",
"--source",
help=(
"Model source: huggingface (default) | modelscope. "
"Auth via HF_TOKEN or MODELSCOPE_API_TOKEN env var."
),
),
) -> None:
"""Evaluate a model against target hardware."""
if lang in ("en", "zh"):
set_locale(lang) # type: ignore[arg-type]
# Meta commands short-circuit before requiring model_id + --gpu.
if list_gpus:
render_gpu_list(load_database(), _console)
return
if benchmark:
results = run_all()
render_results(results, _console)
sys.exit(exit_code_from(results))
if not model_id:
_err.print("[red]Missing argument MODEL_ID. Use --help for usage.[/red]")
raise typer.Exit(code=1)
if not gpu:
_err.print("[red]Missing option --gpu. Use --list-gpus to see choices.[/red]")
raise typer.Exit(code=1)
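    # Resolve --source to a concrete ModelSource; short aliases hf/ms accepted.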
src_obj: ModelSource
src_lower = source.lower()
if src_lower in ("hf", "huggingface"):
src_obj = HuggingFaceSource()
elif src_lower in ("ms", "modelscope"):
src_obj = ModelScopeSource()
else:
_err.print(
f"[red]Unknown --source '{source}'. Use 'huggingface' or 'modelscope'.[/red]"
)
raise typer.Exit(code=1)
evaluator = Evaluator(source=src_obj)
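    # Source errors map to distinct exit codes (auth 2, not found 3,
    # unavailable 4) so callers can branch on the failure class.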
try:
report = evaluator.evaluate(
model_id=model_id,
gpu=gpu,
engine=engine,
gpu_count=gpu_count,
context_length=context_length,
refresh=refresh,
input_tokens=input_tokens,
output_tokens=output_tokens,
target_tokens_per_sec=target_tokens_per_sec,
prefill_utilization=prefill_util,
decode_bw_utilization=decode_bw_util,
concurrency_degradation=concurrency_degradation,
)
except AuthRequiredError as e:
_err.print(f"[bold red]{t('cli.err.auth_required')}[/bold red] {e}")
sys.exit(2)
except ModelNotFoundError as e:
_err.print(f"[bold red]{t('cli.err.model_not_found')}[/bold red] {e}")
sys.exit(3)
except SourceUnavailableError as e:
_err.print(f"[bold red]{t('cli.err.source_unavailable')}[/bold red] {e}")
sys.exit(4)
render(report, _console)
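    # Build the derivation trace once; both --explain and --llm-review consume it.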
explain_entries = build_explain(report) if (explain or llm_review) else []
if explain:
render_explain(explain_entries, _console)
if llm_review:
        # Locale has already been resolved by the set_locale() calls above.
result = run_review(explain_entries, locale=get_locale())
render_llm_review(result, _console)
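# Running the module directly (e.g. `python -m llm_cal.cli`) invokes the Typer app.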
if __name__ == "__main__":
app()