| """CLI entry point. Thin shell over `Evaluator` + rich formatter.""" | |
| from __future__ import annotations | |
| import sys | |
| import typer | |
| from rich.console import Console | |
| from llm_cal.benchmark.runner import exit_code_from, render_results, run_all | |
| from llm_cal.common.i18n import detect_locale_from_env, get_locale, set_locale, t | |
| from llm_cal.core.evaluator import Evaluator | |
| from llm_cal.core.explain import build as build_explain | |
| from llm_cal.hardware.loader import load_database | |
| from llm_cal.llm_review.reviewer import run_review | |
| from llm_cal.model_source.base import ( | |
| AuthRequiredError, | |
| ModelNotFoundError, | |
| ModelSource, | |
| SourceUnavailableError, | |
| ) | |
| from llm_cal.model_source.huggingface import HuggingFaceSource | |
| from llm_cal.model_source.modelscope import ModelScopeSource | |
| from llm_cal.output.formatter import ( | |
| render, | |
| render_explain, | |
| render_gpu_list, | |
| render_llm_review, | |
| ) | |
| # Set locale from env first; --lang flag can override inside main() | |
| set_locale(detect_locale_from_env()) | |
| app = typer.Typer( | |
| name="llm-cal", | |
| help="LLM inference hardware calculator.", | |
| no_args_is_help=True, | |
| ) | |
_console = Console()
_err = Console(stderr=True)


@app.command()
def main(
    model_id: str | None = typer.Argument(None, help="HuggingFace or ModelScope model id"),
    gpu: str | None = typer.Option(None, "--gpu", help="GPU type, e.g. H800, A100-80G"),
    engine: str = typer.Option("vllm", "--engine", help="Inference engine: vllm | sglang"),
    gpu_count: int | None = typer.Option(
        None, "--gpu-count", help="Force GPU count (otherwise the tool recommends one)"
    ),
    context_length: int | None = typer.Option(
        None, "--context-length", help="Context length for KV cache estimation"
    ),
    refresh: bool = typer.Option(False, "--refresh", help="Bypass cache and re-fetch"),
    lang: str | None = typer.Option(
        None,
        "--lang",
        help="Output language: en | zh (default auto-detects from the LANG env var)",
    ),
    list_gpus: bool = typer.Option(
        False,
        "--list-gpus",
        help="List all supported GPUs and exit (no model_id needed)",
    ),
    benchmark: bool = typer.Option(
        False,
        "--benchmark",
        help=(
            "Run the curated benchmark dataset: compare tool output against "
            "reference values from the HF API, model cards, and vLLM recipes. "
            "Requires network. Exit 0 on all-pass, 1 if any FAIL."
        ),
    ),
    input_tokens: int = typer.Option(
        2000,
        "--input-tokens",
        help="Input token budget for prefill-latency estimation (default: 2000).",
    ),
    output_tokens: int = typer.Option(
        512,
        "--output-tokens",
        help="Output token budget for total-latency math (default: 512).",
    ),
    target_tokens_per_sec: float = typer.Option(
        30.0,
        "--target-tokens-per-sec",
        help="SLA: per-user decode tokens/second (drives L bound). Default: 30.",
    ),
    prefill_util: float = typer.Option(
        0.40,
        "--prefill-util",
        help="Compute utilization factor for prefill (empirical, default 0.40).",
    ),
    decode_bw_util: float = typer.Option(
        0.50,
        "--decode-bw-util",
        help="Memory-bandwidth utilization factor for decode (default 0.50).",
    ),
    concurrency_degradation: float = typer.Option(
        1.0,
        "--concurrency-degradation",
        help=(
            "High-concurrency throughput degradation factor (default 1.0 = "
            "no degradation, the honest baseline). If your engine drops "
            "to 60% efficiency under load, pass 1.67. See docs/methodology.md."
        ),
    ),
    explain: bool = typer.Option(
        False,
        "--explain",
        help=(
            "Print the full derivation trace (formula, inputs, step-by-step, "
            "source) for every non-trivial number. Feed the output to an LLM "
            "if you want a second opinion on the math."
        ),
    ),
    llm_review: bool = typer.Option(
        False,
        "--llm-review",
        help=(
            "EXPERIMENTAL: send the derivation trace to an LLM for a second "
            "opinion. Output is tagged [llm-opinion] and never overrides the "
            "6 primary labels. Requires env vars: LLM_CAL_REVIEWER_API_KEY "
            "(required), LLM_CAL_REVIEWER_BASE_URL (default OpenAI), "
            "LLM_CAL_REVIEWER_MODEL (default gpt-4o)."
        ),
    ),
    source: str = typer.Option(
        "huggingface",
        "--source",
        help=(
            "Model source: huggingface (default) | modelscope. "
            "Auth via the HF_TOKEN or MODELSCOPE_API_TOKEN env var."
        ),
    ),
) -> None:
| """Evaluate a model against target hardware.""" | |
| if lang in ("en", "zh"): | |
| set_locale(lang) # type: ignore[arg-type] | |
| # Meta commands short-circuit before requiring model_id + --gpu. | |
| if list_gpus: | |
| render_gpu_list(load_database(), _console) | |
| return | |
| if benchmark: | |
| results = run_all() | |
| render_results(results, _console) | |
| sys.exit(exit_code_from(results)) | |
| if not model_id: | |
| _err.print("[red]Missing argument MODEL_ID. Use --help for usage.[/red]") | |
| raise typer.Exit(code=1) | |
| if not gpu: | |
| _err.print("[red]Missing option --gpu. Use --list-gpus to see choices.[/red]") | |
| raise typer.Exit(code=1) | |
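    # Resolve --source into a concrete ModelSource; short aliases hf/ms are accepted.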
    src_obj: ModelSource
    src_lower = source.lower()
    if src_lower in ("hf", "huggingface"):
        src_obj = HuggingFaceSource()
    elif src_lower in ("ms", "modelscope"):
        src_obj = ModelScopeSource()
    else:
        _err.print(
            f"[red]Unknown --source '{source}'. Use 'huggingface' or 'modelscope'.[/red]"
        )
        raise typer.Exit(code=1)

    evaluator = Evaluator(source=src_obj)
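    # Failures map to distinct exit codes so wrapper scripts can branch on them:
    # 0 = success, 1 = usage error, 2 = auth required, 3 = model not found,
    # 4 = source unavailable.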
    try:
        report = evaluator.evaluate(
            model_id=model_id,
            gpu=gpu,
            engine=engine,
            gpu_count=gpu_count,
            context_length=context_length,
            refresh=refresh,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            target_tokens_per_sec=target_tokens_per_sec,
            prefill_utilization=prefill_util,
            decode_bw_utilization=decode_bw_util,
            concurrency_degradation=concurrency_degradation,
        )
    except AuthRequiredError as e:
        _err.print(f"[bold red]{t('cli.err.auth_required')}[/bold red] {e}")
        sys.exit(2)
    except ModelNotFoundError as e:
        _err.print(f"[bold red]{t('cli.err.model_not_found')}[/bold red] {e}")
        sys.exit(3)
    except SourceUnavailableError as e:
        _err.print(f"[bold red]{t('cli.err.source_unavailable')}[/bold red] {e}")
        sys.exit(4)
    render(report, _console)

    explain_entries = build_explain(report) if (explain or llm_review) else []
    if explain:
        render_explain(explain_entries, _console)
    if llm_review:
        # Locale at this point has been resolved by the set_locale() calls above.
        result = run_review(explain_entries, locale=get_locale())
        render_llm_review(result, _console)


if __name__ == "__main__":
    app()
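
# Illustrative invocations (the model id below is a placeholder, not a tested example):
#   llm-cal org/model-name --gpu H800 --context-length 32768 --explain
#   llm-cal --list-gpus
#   llm-cal org/model-name --gpu A100-80G --source modelscope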