"""CLI entry point. Thin shell over `Evaluator` + rich formatter.""" from __future__ import annotations import sys import typer from rich.console import Console from llm_cal.benchmark.runner import exit_code_from, render_results, run_all from llm_cal.common.i18n import detect_locale_from_env, get_locale, set_locale, t from llm_cal.core.evaluator import Evaluator from llm_cal.core.explain import build as build_explain from llm_cal.hardware.loader import load_database from llm_cal.llm_review.reviewer import run_review from llm_cal.model_source.base import ( AuthRequiredError, ModelNotFoundError, ModelSource, SourceUnavailableError, ) from llm_cal.model_source.huggingface import HuggingFaceSource from llm_cal.model_source.modelscope import ModelScopeSource from llm_cal.output.formatter import ( render, render_explain, render_gpu_list, render_llm_review, ) # Set locale from env first; --lang flag can override inside main() set_locale(detect_locale_from_env()) app = typer.Typer( name="llm-cal", help="LLM inference hardware calculator.", no_args_is_help=True, ) _console = Console() _err = Console(stderr=True) @app.command() def main( model_id: str | None = typer.Argument(None, help="HuggingFace or ModelScope model id"), gpu: str | None = typer.Option(None, "--gpu", help="GPU type, e.g. H800, A100-80G"), engine: str = typer.Option("vllm", "--engine", help="Inference engine: vllm | sglang"), gpu_count: int | None = typer.Option( None, "--gpu-count", help="Force GPU count (otherwise tool recommends)" ), context_length: int | None = typer.Option( None, "--context-length", help="Context length for KV cache estimation" ), refresh: bool = typer.Option(False, "--refresh", help="Bypass cache and re-fetch"), lang: str | None = typer.Option( None, "--lang", help="Output language: en | zh (default auto-detects from LANG env)", ), list_gpus: bool = typer.Option( False, "--list-gpus", help="List all supported GPUs and exit (no model_id needed)", ), benchmark: bool = typer.Option( False, "--benchmark", help=( "Run the curated benchmark dataset: compare tool output against " "reference values from HF API, model cards, vLLM recipes. " "Requires network. Exit 0 on all-pass, 1 if any FAIL." ), ), input_tokens: int = typer.Option( 2000, "--input-tokens", help="Input token budget for prefill-latency estimation (default: 2000).", ), output_tokens: int = typer.Option( 512, "--output-tokens", help="Output token budget for total-latency math (default: 512).", ), target_tokens_per_sec: float = typer.Option( 30.0, "--target-tokens-per-sec", help="SLA: per-user decode tokens/second (drives L bound). Default: 30.", ), prefill_util: float = typer.Option( 0.40, "--prefill-util", help="Compute utilization factor for prefill (empirical, default 0.40).", ), decode_bw_util: float = typer.Option( 0.50, "--decode-bw-util", help="Memory-bandwidth utilization factor for decode (default 0.50).", ), concurrency_degradation: float = typer.Option( 1.0, "--concurrency-degradation", help=( "High-concurrency throughput degradation factor (default 1.0 = " "no degradation — the honest baseline). If your engine drops " "to 60% efficiency under load, pass 1.67. See docs/methodology.md." ), ), explain: bool = typer.Option( False, "--explain", help=( "Print the full derivation trace (formula, inputs, step-by-step, " "source) for every non-trivial number. Feed the output to an LLM " "if you want a second opinion on the math." 
), ), llm_review: bool = typer.Option( False, "--llm-review", help=( "EXPERIMENTAL: send the derivation trace to an LLM for a second " "opinion. Output is tagged [llm-opinion] and never overrides the " "6 primary labels. Requires env vars: LLM_CAL_REVIEWER_API_KEY " "(required), LLM_CAL_REVIEWER_BASE_URL (default OpenAI), " "LLM_CAL_REVIEWER_MODEL (default gpt-4o)." ), ), source: str = typer.Option( "huggingface", "--source", help=( "Model source: huggingface (default) | modelscope. " "Auth via HF_TOKEN or MODELSCOPE_API_TOKEN env var." ), ), ) -> None: """Evaluate a model against target hardware.""" if lang in ("en", "zh"): set_locale(lang) # type: ignore[arg-type] # Meta commands short-circuit before requiring model_id + --gpu. if list_gpus: render_gpu_list(load_database(), _console) return if benchmark: results = run_all() render_results(results, _console) sys.exit(exit_code_from(results)) if not model_id: _err.print("[red]Missing argument MODEL_ID. Use --help for usage.[/red]") raise typer.Exit(code=1) if not gpu: _err.print("[red]Missing option --gpu. Use --list-gpus to see choices.[/red]") raise typer.Exit(code=1) src_obj: ModelSource src_lower = source.lower() if src_lower in ("hf", "huggingface"): src_obj = HuggingFaceSource() elif src_lower in ("ms", "modelscope"): src_obj = ModelScopeSource() else: _err.print( f"[red]Unknown --source '{source}'. Use 'huggingface' or 'modelscope'.[/red]" ) raise typer.Exit(code=1) evaluator = Evaluator(source=src_obj) try: report = evaluator.evaluate( model_id=model_id, gpu=gpu, engine=engine, gpu_count=gpu_count, context_length=context_length, refresh=refresh, input_tokens=input_tokens, output_tokens=output_tokens, target_tokens_per_sec=target_tokens_per_sec, prefill_utilization=prefill_util, decode_bw_utilization=decode_bw_util, concurrency_degradation=concurrency_degradation, ) except AuthRequiredError as e: _err.print(f"[bold red]{t('cli.err.auth_required')}[/bold red] {e}") sys.exit(2) except ModelNotFoundError as e: _err.print(f"[bold red]{t('cli.err.model_not_found')}[/bold red] {e}") sys.exit(3) except SourceUnavailableError as e: _err.print(f"[bold red]{t('cli.err.source_unavailable')}[/bold red] {e}") sys.exit(4) render(report, _console) explain_entries = build_explain(report) if (explain or llm_review) else [] if explain: render_explain(explain_entries, _console) if llm_review: # Locale at this point has been resolved by set_locale() calls above. result = run_review(explain_entries, locale=get_locale()) render_llm_review(result, _console) if __name__ == "__main__": app()
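
# Example invocations (a sketch, assuming the package installs a `llm-cal`
# console script and using `org/model` as a placeholder model id):
#
#   llm-cal --list-gpus
#   llm-cal --benchmark
#   llm-cal org/model --gpu H800 --context-length 32768 --explain
#   llm-cal org/model --gpu A100-80G --source modelscope --lang zh --llm-review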