File size: 7,188 Bytes
cc6274a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
"""CLI entry point. Thin shell over `Evaluator` + rich formatter."""

from __future__ import annotations

import sys

import typer
from rich.console import Console

from llm_cal.benchmark.runner import exit_code_from, render_results, run_all
from llm_cal.common.i18n import detect_locale_from_env, get_locale, set_locale, t
from llm_cal.core.evaluator import Evaluator
from llm_cal.core.explain import build as build_explain
from llm_cal.hardware.loader import load_database
from llm_cal.llm_review.reviewer import run_review
from llm_cal.model_source.base import (
    AuthRequiredError,
    ModelNotFoundError,
    ModelSource,
    SourceUnavailableError,
)
from llm_cal.model_source.huggingface import HuggingFaceSource
from llm_cal.model_source.modelscope import ModelScopeSource
from llm_cal.output.formatter import (
    render,
    render_explain,
    render_gpu_list,
    render_llm_review,
)

# Set locale from env first; --lang flag can override inside main()
set_locale(detect_locale_from_env())

# Single Typer app with one default command (main). no_args_is_help makes a
# bare `llm-cal` invocation print usage instead of erroring on the missing
# model_id argument.
app = typer.Typer(
    name="llm-cal",
    help="LLM inference hardware calculator.",
    no_args_is_help=True,
)
# Normal output goes to stdout; diagnostics go to stderr so piped/captured
# stdout stays clean for downstream tooling.
_console = Console()
_err = Console(stderr=True)


@app.command()
def main(
    model_id: str | None = typer.Argument(None, help="HuggingFace or ModelScope model id"),
    gpu: str | None = typer.Option(None, "--gpu", help="GPU type, e.g. H800, A100-80G"),
    engine: str = typer.Option("vllm", "--engine", help="Inference engine: vllm | sglang"),
    gpu_count: int | None = typer.Option(
        None, "--gpu-count", help="Force GPU count (otherwise tool recommends)"
    ),
    context_length: int | None = typer.Option(
        None, "--context-length", help="Context length for KV cache estimation"
    ),
    refresh: bool = typer.Option(False, "--refresh", help="Bypass cache and re-fetch"),
    lang: str | None = typer.Option(
        None,
        "--lang",
        help="Output language: en | zh (default auto-detects from LANG env)",
    ),
    list_gpus: bool = typer.Option(
        False,
        "--list-gpus",
        help="List all supported GPUs and exit (no model_id needed)",
    ),
    benchmark: bool = typer.Option(
        False,
        "--benchmark",
        help=(
            "Run the curated benchmark dataset: compare tool output against "
            "reference values from HF API, model cards, vLLM recipes. "
            "Requires network. Exit 0 on all-pass, 1 if any FAIL."
        ),
    ),
    input_tokens: int = typer.Option(
        2000,
        "--input-tokens",
        help="Input token budget for prefill-latency estimation (default: 2000).",
    ),
    output_tokens: int = typer.Option(
        512,
        "--output-tokens",
        help="Output token budget for total-latency math (default: 512).",
    ),
    target_tokens_per_sec: float = typer.Option(
        30.0,
        "--target-tokens-per-sec",
        help="SLA: per-user decode tokens/second (drives L bound). Default: 30.",
    ),
    prefill_util: float = typer.Option(
        0.40,
        "--prefill-util",
        help="Compute utilization factor for prefill (empirical, default 0.40).",
    ),
    decode_bw_util: float = typer.Option(
        0.50,
        "--decode-bw-util",
        help="Memory-bandwidth utilization factor for decode (default 0.50).",
    ),
    concurrency_degradation: float = typer.Option(
        1.0,
        "--concurrency-degradation",
        help=(
            "High-concurrency throughput degradation factor (default 1.0 = "
            "no degradation — the honest baseline). If your engine drops "
            "to 60% efficiency under load, pass 1.67. See docs/methodology.md."
        ),
    ),
    explain: bool = typer.Option(
        False,
        "--explain",
        help=(
            "Print the full derivation trace (formula, inputs, step-by-step, "
            "source) for every non-trivial number. Feed the output to an LLM "
            "if you want a second opinion on the math."
        ),
    ),
    llm_review: bool = typer.Option(
        False,
        "--llm-review",
        help=(
            "EXPERIMENTAL: send the derivation trace to an LLM for a second "
            "opinion. Output is tagged [llm-opinion] and never overrides the "
            "6 primary labels. Requires env vars: LLM_CAL_REVIEWER_API_KEY "
            "(required), LLM_CAL_REVIEWER_BASE_URL (default OpenAI), "
            "LLM_CAL_REVIEWER_MODEL (default gpt-4o)."
        ),
    ),
    source: str = typer.Option(
        "huggingface",
        "--source",
        help=(
            "Model source: huggingface (default) | modelscope. "
            "Auth via HF_TOKEN or MODELSCOPE_API_TOKEN env var."
        ),
    ),
) -> None:
    """Evaluate a model against target hardware.

    Flow: resolve locale -> handle meta commands (--list-gpus, --benchmark)
    -> validate required inputs -> resolve the model source -> run the
    evaluator -> render the report (plus optional --explain / --llm-review).

    Exit codes: 0 success, 1 usage error (or benchmark FAIL), 2 auth
    required, 3 model not found, 4 source unavailable.
    """
    if lang is not None:
        # Accept case-insensitive values ("EN", "Zh", ...); the old exact
        # match silently ignored them despite the help text promising en | zh.
        lang_norm = lang.lower()
        if lang_norm in ("en", "zh"):
            set_locale(lang_norm)  # type: ignore[arg-type]
        else:
            # Don't fail hard (previous behavior was to ignore), but tell the
            # user instead of silently falling back to the env-detected locale.
            _err.print(
                f"[yellow]Ignoring unknown --lang '{lang}'; expected 'en' or 'zh'.[/yellow]"
            )

    # Meta commands short-circuit before requiring model_id + --gpu.
    if list_gpus:
        render_gpu_list(load_database(), _console)
        return

    if benchmark:
        results = run_all()
        render_results(results, _console)
        sys.exit(exit_code_from(results))

    if not model_id:
        _err.print("[red]Missing argument MODEL_ID. Use --help for usage.[/red]")
        raise typer.Exit(code=1)
    if not gpu:
        _err.print("[red]Missing option --gpu. Use --list-gpus to see choices.[/red]")
        raise typer.Exit(code=1)

    evaluator = Evaluator(source=_resolve_source(source))
    try:
        report = evaluator.evaluate(
            model_id=model_id,
            gpu=gpu,
            engine=engine,
            gpu_count=gpu_count,
            context_length=context_length,
            refresh=refresh,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            target_tokens_per_sec=target_tokens_per_sec,
            prefill_utilization=prefill_util,
            decode_bw_utilization=decode_bw_util,
            concurrency_degradation=concurrency_degradation,
        )
    except AuthRequiredError as e:
        _err.print(f"[bold red]{t('cli.err.auth_required')}[/bold red] {e}")
        sys.exit(2)
    except ModelNotFoundError as e:
        _err.print(f"[bold red]{t('cli.err.model_not_found')}[/bold red] {e}")
        sys.exit(3)
    except SourceUnavailableError as e:
        _err.print(f"[bold red]{t('cli.err.source_unavailable')}[/bold red] {e}")
        sys.exit(4)

    render(report, _console)
    # The derivation trace feeds both --explain and --llm-review; build it
    # once and only when at least one of them asked for it.
    explain_entries = build_explain(report) if (explain or llm_review) else []
    if explain:
        render_explain(explain_entries, _console)
    if llm_review:
        # Locale at this point has been resolved by set_locale() calls above.
        result = run_review(explain_entries, locale=get_locale())
        render_llm_review(result, _console)


def _resolve_source(source: str) -> ModelSource:
    """Map the --source flag (case-insensitive) to a concrete ModelSource.

    Accepts the short aliases 'hf' and 'ms'. Exits with code 1 on an
    unrecognized value.
    """
    src_lower = source.lower()
    if src_lower in ("hf", "huggingface"):
        return HuggingFaceSource()
    if src_lower in ("ms", "modelscope"):
        return ModelScopeSource()
    _err.print(
        f"[red]Unknown --source '{source}'. Use 'huggingface' or 'modelscope'.[/red]"
    )
    raise typer.Exit(code=1)


if __name__ == "__main__":
    app()