"""CLI entry point. Thin shell over `Evaluator` + rich formatter."""

from __future__ import annotations

import sys

import typer
from rich.console import Console

from llm_cal.benchmark.runner import exit_code_from, render_results, run_all
from llm_cal.common.i18n import detect_locale_from_env, get_locale, set_locale, t
from llm_cal.core.evaluator import Evaluator
from llm_cal.core.explain import build as build_explain
from llm_cal.hardware.loader import load_database
from llm_cal.llm_review.reviewer import run_review
from llm_cal.model_source.base import (
    AuthRequiredError,
    ModelNotFoundError,
    ModelSource,
    SourceUnavailableError,
)
from llm_cal.model_source.huggingface import HuggingFaceSource
from llm_cal.model_source.modelscope import ModelScopeSource
from llm_cal.output.formatter import (
    render,
    render_explain,
    render_gpu_list,
    render_llm_review,
)

# Set locale from env first; --lang flag can override inside main()
set_locale(detect_locale_from_env())

app = typer.Typer(
    name="llm-cal",
    help="LLM inference hardware calculator.",
    no_args_is_help=True,
)

_console = Console()
_err = Console(stderr=True)

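# Example invocations. The model id below is a placeholder; substitute any
# HuggingFace or ModelScope id:
#
#   llm-cal some-org/some-model --gpu H800
#   llm-cal some-org/some-model --gpu A100-80G --engine sglang --context-length 32768
#   llm-cal some-org/some-model --gpu H800 --source modelscope --explain
#   llm-cal --list-gpus
#   llm-cal --benchmark
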
@app.command()
def main(
    model_id: str | None = typer.Argument(None, help="HuggingFace or ModelScope model id"),
    gpu: str | None = typer.Option(None, "--gpu", help="GPU type, e.g. H800, A100-80G"),
    engine: str = typer.Option("vllm", "--engine", help="Inference engine: vllm | sglang"),
    gpu_count: int | None = typer.Option(
        None, "--gpu-count", help="Force GPU count (otherwise the tool recommends one)"
    ),
    context_length: int | None = typer.Option(
        None, "--context-length", help="Context length for KV-cache estimation"
    ),
    refresh: bool = typer.Option(False, "--refresh", help="Bypass cache and re-fetch"),
    lang: str | None = typer.Option(
        None,
        "--lang",
        help="Output language: en | zh (default: auto-detect from the LANG env var)",
    ),
    list_gpus: bool = typer.Option(
        False,
        "--list-gpus",
        help="List all supported GPUs and exit (no model_id needed)",
    ),
    benchmark: bool = typer.Option(
        False,
        "--benchmark",
        help=(
            "Run the curated benchmark dataset: compare tool output against "
            "reference values from the HF API, model cards, and vLLM recipes. "
            "Requires network. Exits 0 if all pass, 1 if any FAIL."
        ),
    ),
    input_tokens: int = typer.Option(
        2000,
        "--input-tokens",
        help="Input token budget for prefill-latency estimation (default: 2000).",
    ),
    output_tokens: int = typer.Option(
        512,
        "--output-tokens",
        help="Output token budget for total-latency math (default: 512).",
    ),
    target_tokens_per_sec: float = typer.Option(
        30.0,
        "--target-tokens-per-sec",
        help="SLA: per-user decode tokens/second (drives the L bound). Default: 30.",
    ),
    prefill_util: float = typer.Option(
        0.40,
        "--prefill-util",
        help="Compute-utilization factor for prefill (empirical, default 0.40).",
    ),
    decode_bw_util: float = typer.Option(
        0.50,
        "--decode-bw-util",
        help="Memory-bandwidth utilization factor for decode (default 0.50).",
    ),
    concurrency_degradation: float = typer.Option(
        1.0,
        "--concurrency-degradation",
        help=(
            "High-concurrency throughput degradation factor (default 1.0 = "
            "no degradation, the honest baseline). If your engine drops "
            "to 60% efficiency under load, pass 1.67. See docs/methodology.md."
        ),
    ),
    explain: bool = typer.Option(
        False,
        "--explain",
        help=(
            "Print the full derivation trace (formula, inputs, step-by-step, "
            "source) for every non-trivial number. Feed the output to an LLM "
            "if you want a second opinion on the math."
        ),
    ),
    llm_review: bool = typer.Option(
        False,
        "--llm-review",
        help=(
            "EXPERIMENTAL: send the derivation trace to an LLM for a second "
            "opinion. Output is tagged [llm-opinion] and never overrides the "
            "6 primary labels. Requires env vars: LLM_CAL_REVIEWER_API_KEY "
            "(required), LLM_CAL_REVIEWER_BASE_URL (default: OpenAI), "
            "LLM_CAL_REVIEWER_MODEL (default: gpt-4o)."
        ),
    ),
    source: str = typer.Option(
        "huggingface",
        "--source",
        help=(
            "Model source: huggingface (default) | modelscope. "
            "Auth via the HF_TOKEN or MODELSCOPE_API_TOKEN env var."
        ),
    ),
) -> None:
"""Evaluate a model against target hardware."""
if lang in ("en", "zh"):
set_locale(lang) # type: ignore[arg-type]
# Meta commands short-circuit before requiring model_id + --gpu.
if list_gpus:
render_gpu_list(load_database(), _console)
return
    if benchmark:
        results = run_all()
        render_results(results, _console)
        sys.exit(exit_code_from(results))

    if not model_id:
        _err.print("[red]Missing argument MODEL_ID. Use --help for usage.[/red]")
        raise typer.Exit(code=1)
    if not gpu:
        _err.print("[red]Missing option --gpu. Use --list-gpus to see choices.[/red]")
        raise typer.Exit(code=1)
    src_obj: ModelSource
    src_lower = source.lower()
    if src_lower in ("hf", "huggingface"):
        src_obj = HuggingFaceSource()
    elif src_lower in ("ms", "modelscope"):
        src_obj = ModelScopeSource()
    else:
        _err.print(
            f"[red]Unknown --source '{source}'. Use 'huggingface' or 'modelscope'.[/red]"
        )
        raise typer.Exit(code=1)
    evaluator = Evaluator(source=src_obj)
    try:
        report = evaluator.evaluate(
            model_id=model_id,
            gpu=gpu,
            engine=engine,
            gpu_count=gpu_count,
            context_length=context_length,
            refresh=refresh,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            target_tokens_per_sec=target_tokens_per_sec,
            prefill_utilization=prefill_util,
            decode_bw_utilization=decode_bw_util,
            concurrency_degradation=concurrency_degradation,
        )
    except AuthRequiredError as e:
        _err.print(f"[bold red]{t('cli.err.auth_required')}[/bold red] {e}")
        sys.exit(2)
    except ModelNotFoundError as e:
        _err.print(f"[bold red]{t('cli.err.model_not_found')}[/bold red] {e}")
        sys.exit(3)
    except SourceUnavailableError as e:
        _err.print(f"[bold red]{t('cli.err.source_unavailable')}[/bold red] {e}")
        sys.exit(4)
    render(report, _console)

    explain_entries = build_explain(report) if (explain or llm_review) else []
    if explain:
        render_explain(explain_entries, _console)
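
    # Reviewer configuration for --llm-review (values below are placeholders;
    # only the variable names and stated defaults come from the option help):
    #
    #   export LLM_CAL_REVIEWER_API_KEY=...       # required
    #   export LLM_CAL_REVIEWER_BASE_URL=...      # optional (default: OpenAI)
    #   export LLM_CAL_REVIEWER_MODEL=gpt-4o      # optional (gpt-4o is the default)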
    if llm_review:
        # Locale at this point has been resolved by set_locale() calls above.
        result = run_review(explain_entries, locale=get_locale())
        render_llm_review(result, _console)
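

# A minimal smoke-test sketch using Typer's test runner (CliRunner is Typer's
# standard testing API; the specific invocation below is illustrative and not
# part of this module):
#
#     from typer.testing import CliRunner
#
#     runner = CliRunner()
#     result = runner.invoke(app, ["--list-gpus"])
#     assert result.exit_code == 0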

if __name__ == "__main__":
    app()