File size: 7,188 Bytes
cc6274a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
"""CLI entry point. Thin shell over `Evaluator` + rich formatter."""

from __future__ import annotations

import sys

import typer
from rich.console import Console

from llm_cal.benchmark.runner import exit_code_from, render_results, run_all
from llm_cal.common.i18n import detect_locale_from_env, get_locale, set_locale, t
from llm_cal.core.evaluator import Evaluator
from llm_cal.core.explain import build as build_explain
from llm_cal.hardware.loader import load_database
from llm_cal.llm_review.reviewer import run_review
from llm_cal.model_source.base import (
    AuthRequiredError,
    ModelNotFoundError,
    ModelSource,
    SourceUnavailableError,
)
from llm_cal.model_source.huggingface import HuggingFaceSource
from llm_cal.model_source.modelscope import ModelScopeSource
from llm_cal.output.formatter import (
    render,
    render_explain,
    render_gpu_list,
    render_llm_review,
)

# Set locale from env first; --lang flag can override inside main()
set_locale(detect_locale_from_env())

# Single Typer app with one default command (main). no_args_is_help makes a
# bare `llm-cal` invocation print usage instead of erroring on the missing
# model_id argument.
app = typer.Typer(
    name="llm-cal",
    help="LLM inference hardware calculator.",
    no_args_is_help=True,
)
# Normal output goes to stdout; diagnostics go to stderr so piped/captured
# stdout stays clean for downstream tooling.
_console = Console()
_err = Console(stderr=True)


@app.command()
def main(
    model_id: str | None = typer.Argument(None, help="HuggingFace or ModelScope model id"),
    gpu: str | None = typer.Option(None, "--gpu", help="GPU type, e.g. H800, A100-80G"),
    engine: str = typer.Option("vllm", "--engine", help="Inference engine: vllm | sglang"),
    gpu_count: int | None = typer.Option(
        None, "--gpu-count", help="Force GPU count (otherwise tool recommends)"
    ),
    context_length: int | None = typer.Option(
        None, "--context-length", help="Context length for KV cache estimation"
    ),
    refresh: bool = typer.Option(False, "--refresh", help="Bypass cache and re-fetch"),
    lang: str | None = typer.Option(
        None,
        "--lang",
        help="Output language: en | zh (default auto-detects from LANG env)",
    ),
    list_gpus: bool = typer.Option(
        False,
        "--list-gpus",
        help="List all supported GPUs and exit (no model_id needed)",
    ),
    benchmark: bool = typer.Option(
        False,
        "--benchmark",
        help=(
            "Run the curated benchmark dataset: compare tool output against "
            "reference values from HF API, model cards, vLLM recipes. "
            "Requires network. Exit 0 on all-pass, 1 if any FAIL."
        ),
    ),
    input_tokens: int = typer.Option(
        2000,
        "--input-tokens",
        help="Input token budget for prefill-latency estimation (default: 2000).",
    ),
    output_tokens: int = typer.Option(
        512,
        "--output-tokens",
        help="Output token budget for total-latency math (default: 512).",
    ),
    target_tokens_per_sec: float = typer.Option(
        30.0,
        "--target-tokens-per-sec",
        help="SLA: per-user decode tokens/second (drives L bound). Default: 30.",
    ),
    prefill_util: float = typer.Option(
        0.40,
        "--prefill-util",
        help="Compute utilization factor for prefill (empirical, default 0.40).",
    ),
    decode_bw_util: float = typer.Option(
        0.50,
        "--decode-bw-util",
        help="Memory-bandwidth utilization factor for decode (default 0.50).",
    ),
    concurrency_degradation: float = typer.Option(
        1.0,
        "--concurrency-degradation",
        help=(
            "High-concurrency throughput degradation factor (default 1.0 = "
            "no degradation — the honest baseline). If your engine drops "
            "to 60% efficiency under load, pass 1.67. See docs/methodology.md."
        ),
    ),
    explain: bool = typer.Option(
        False,
        "--explain",
        help=(
            "Print the full derivation trace (formula, inputs, step-by-step, "
            "source) for every non-trivial number. Feed the output to an LLM "
            "if you want a second opinion on the math."
        ),
    ),
    llm_review: bool = typer.Option(
        False,
        "--llm-review",
        help=(
            "EXPERIMENTAL: send the derivation trace to an LLM for a second "
            "opinion. Output is tagged [llm-opinion] and never overrides the "
            "6 primary labels. Requires env vars: LLM_CAL_REVIEWER_API_KEY "
            "(required), LLM_CAL_REVIEWER_BASE_URL (default OpenAI), "
            "LLM_CAL_REVIEWER_MODEL (default gpt-4o)."
        ),
    ),
    source: str = typer.Option(
        "huggingface",
        "--source",
        help=(
            "Model source: huggingface (default) | modelscope. "
            "Auth via HF_TOKEN or MODELSCOPE_API_TOKEN env var."
        ),
    ),
) -> None:
    """Evaluate a model against target hardware.

    Flow: resolve locale -> handle meta commands (--list-gpus, --benchmark)
    -> validate required inputs -> resolve the model source -> run the
    evaluator -> render the report (plus optional --explain / --llm-review).

    Exit codes: 0 success, 1 usage error (or benchmark FAIL), 2 auth
    required, 3 model not found, 4 source unavailable.
    """
    if lang is not None:
        # Accept case-insensitive values ("EN", "Zh", ...); the old exact
        # match silently ignored them despite the help text promising en | zh.
        lang_norm = lang.lower()
        if lang_norm in ("en", "zh"):
            set_locale(lang_norm)  # type: ignore[arg-type]
        else:
            # Don't fail hard (previous behavior was to ignore), but tell the
            # user instead of silently falling back to the env-detected locale.
            _err.print(
                f"[yellow]Ignoring unknown --lang '{lang}'; expected 'en' or 'zh'.[/yellow]"
            )

    # Meta commands short-circuit before requiring model_id + --gpu.
    if list_gpus:
        render_gpu_list(load_database(), _console)
        return

    if benchmark:
        results = run_all()
        render_results(results, _console)
        sys.exit(exit_code_from(results))

    if not model_id:
        _err.print("[red]Missing argument MODEL_ID. Use --help for usage.[/red]")
        raise typer.Exit(code=1)
    if not gpu:
        _err.print("[red]Missing option --gpu. Use --list-gpus to see choices.[/red]")
        raise typer.Exit(code=1)

    evaluator = Evaluator(source=_resolve_source(source))
    try:
        report = evaluator.evaluate(
            model_id=model_id,
            gpu=gpu,
            engine=engine,
            gpu_count=gpu_count,
            context_length=context_length,
            refresh=refresh,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            target_tokens_per_sec=target_tokens_per_sec,
            prefill_utilization=prefill_util,
            decode_bw_utilization=decode_bw_util,
            concurrency_degradation=concurrency_degradation,
        )
    except AuthRequiredError as e:
        _err.print(f"[bold red]{t('cli.err.auth_required')}[/bold red] {e}")
        sys.exit(2)
    except ModelNotFoundError as e:
        _err.print(f"[bold red]{t('cli.err.model_not_found')}[/bold red] {e}")
        sys.exit(3)
    except SourceUnavailableError as e:
        _err.print(f"[bold red]{t('cli.err.source_unavailable')}[/bold red] {e}")
        sys.exit(4)

    render(report, _console)
    # The derivation trace feeds both --explain and --llm-review; build it
    # once and only when at least one of them asked for it.
    explain_entries = build_explain(report) if (explain or llm_review) else []
    if explain:
        render_explain(explain_entries, _console)
    if llm_review:
        # Locale at this point has been resolved by set_locale() calls above.
        result = run_review(explain_entries, locale=get_locale())
        render_llm_review(result, _console)


def _resolve_source(source: str) -> ModelSource:
    """Map the --source flag (case-insensitive) to a concrete ModelSource.

    Accepts the short aliases 'hf' and 'ms'. Exits with code 1 on an
    unrecognized value.
    """
    src_lower = source.lower()
    if src_lower in ("hf", "huggingface"):
        return HuggingFaceSource()
    if src_lower in ("ms", "modelscope"):
        return ModelScopeSource()
    _err.print(
        f"[red]Unknown --source '{source}'. Use 'huggingface' or 'modelscope'.[/red]"
    )
    raise typer.Exit(code=1)


if __name__ == "__main__":
    app()