"""Model-switching logic for the interactive CLI's ``/model`` command.

Split out of ``agent.main`` so the REPL dispatcher stays focused on input
parsing. Exposes:

* ``SUGGESTED_MODELS`` - the short list shown by ``/model`` with no arg.
* ``is_valid_model_id`` - loose format check on user input.
* ``probe_and_switch_model`` - async: checks routing, fires a 1-token
  probe to resolve the effort cascade, then commits the switch (or
  rejects it on hard error).

The probe's cascade lives in ``agent.core.effort_probe``; this module
glues it to CLI output + session state.
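
Typical wiring from the REPL dispatcher (a sketch only; the real call site
lives in ``agent.main``, and the ``cmd`` / ``arg`` names here are
illustrative)::

    if cmd == "/model":
        if not arg:
            print_model_listing(config, console)
        elif not is_valid_model_id(arg):
            print_invalid_id(arg, console)
        else:
            await probe_and_switch_model(arg, config, session, console, hf_token)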
"""

from __future__ import annotations

from agent.core.effort_probe import ProbeInconclusive, probe_effort

# Suggested models shown by `/model` (not a gate). Users can paste any HF
# model id (e.g. "MiniMaxAI/MiniMax-M2.7") or an `anthropic/` / `openai/`
# prefix for direct API access. For HF ids, append ":fastest" /
# ":cheapest" / ":preferred" / ":<provider>" to override the default
# routing policy (auto = fastest with failover).
SUGGESTED_MODELS = [
{"id": "openai/gpt-5.5", "label": "GPT-5.5"},
{"id": "openai/gpt-5.4", "label": "GPT-5.4"},
{"id": "anthropic/claude-opus-4-7", "label": "Claude Opus 4.7"},
{"id": "anthropic/claude-opus-4-6", "label": "Claude Opus 4.6"},
{
"id": "bedrock/us.anthropic.claude-opus-4-6-v1",
"label": "Claude Opus 4.6 via Bedrock",
},
{"id": "MiniMaxAI/MiniMax-M2.7", "label": "MiniMax M2.7"},
{"id": "moonshotai/Kimi-K2.6", "label": "Kimi K2.6"},
{"id": "zai-org/GLM-5.1", "label": "GLM 5.1"},
{"id": "deepseek-ai/DeepSeek-V4-Pro:deepinfra", "label": "DeepSeek V4 Pro"},
# Local / self-hosted providers (OpenAI-compatible endpoints)
{"id": "llamacpp/llama-3-8b", "label": "llama.cpp (local)"},
{"id": "lmstudio/llama-3-8b", "label": "LM Studio (local)"},
{"id": "mlx/llama-3-8b", "label": "MLX (Apple Silicon, local)"},
{"id": "nim/llama-3-8b", "label": "NVIDIA NIM (local)"},
{"id": "local/llama-3-8b", "label": "Custom local server"},
{"id": "ollama/llama3.1", "label": "Ollama (local)"},
{"id": "vllm/llama-3-8b", "label": "vLLM (local)"},
{"id": "tgi/llama-3-8b", "label": "TGI (local)"},
]
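
# Illustrative routing-suffix forms (model ids here are just examples; any
# id from huggingface.co works):
#   "MiniMaxAI/MiniMax-M2.7"           -> auto routing (fastest, with failover)
#   "MiniMaxAI/MiniMax-M2.7:cheapest"  -> cheapest live provider
#   "MiniMaxAI/MiniMax-M2.7:deepinfra" -> pinned to a single provider
#   "anthropic/claude-opus-4-7"        -> direct Anthropic API, no HF routing
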
_ROUTING_POLICIES = {"fastest", "cheapest", "preferred"}
# Local / self-hosted provider prefixes that route to OpenAI-compatible
# endpoints on the user's machine.
_LOCAL_PREFIXES = {
"llamacpp",
"lmstudio",
"mlx",
"nim",
"local",
"ollama",
"vllm",
"tgi",
}


def is_valid_model_id(model_id: str) -> bool:
    """Loose format check - lets users pick any model id.

    Accepts:
    • anthropic/<model>
    • openai/<model>
    • <org>/<model>[:<tag>] (HF router; tag = provider or policy)
    • huggingface/<org>/<model>[:<tag>] (same, accepts legacy prefix)
    • <local-prefix>/<model> (local OpenAI-compatible server)
Actual availability is verified against the HF router catalog on
switch, and by the provider on the probe's ping call.
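
    Examples (illustrative ids; the check is purely about shape)::

        >>> is_valid_model_id("MiniMaxAI/MiniMax-M2.7:fastest")
        True
        >>> is_valid_model_id("ollama/llama3.1")
        True
        >>> is_valid_model_id("llama3")
        False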
"""
if not model_id or "/" not in model_id:
return False
head = model_id.split(":", 1)[0]
parts = head.split("/")
if len(parts) >= 2 and all(parts):
return True
# Local prefixes only need one part after the prefix (e.g. "ollama/llama3")
if parts[0] in _LOCAL_PREFIXES and len(parts) == 2 and parts[1]:
return True
return False


def _print_hf_routing_info(model_id: str, console) -> bool:
    """Show HF router catalog info (providers, price, context, tool support)
    for an HF-router model id. Returns ``True`` to signal the caller can
    proceed with the switch, ``False`` to indicate a hard problem the user
    should notice before we fire the effort probe.

    Anthropic / OpenAI ids return ``True`` without printing anything -
the probe below covers "does this model exist".
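
    Catalog entries are consumed only through a few attributes (shape
    sketch; the real type lives in ``agent.core.hf_router_catalog``)::

        info.live_providers      # providers currently serving the model; each
                                 # has .provider, .input_price, .output_price,
                                 # .context_length and .supports_tools
        info.any_supports_tools  # True if any provider advertises tool calls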
"""
if model_id.startswith(("anthropic/", "openai/")):
return True
# Local providers bypass the HF router catalog entirely.
if model_id.split("/", 1)[0] in _LOCAL_PREFIXES:
return True
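    # Imported lazily so the catalog module is only loaded for HF-router ids.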
from agent.core import hf_router_catalog as cat
bare, _, tag = model_id.partition(":")
info = cat.lookup(bare)
if info is None:
console.print(
f"[bold red]Warning:[/bold red] '{bare}' isn't in the HF router "
"catalog. Checking anyway β first call may fail."
)
suggestions = cat.fuzzy_suggest(bare)
if suggestions:
console.print(f"[dim]Did you mean: {', '.join(suggestions)}[/dim]")
return True
live = info.live_providers
if not live:
console.print(
f"[bold red]Warning:[/bold red] '{bare}' has no live providers "
"right now. First call will likely fail."
)
return True
if tag and tag not in _ROUTING_POLICIES:
matched = [p for p in live if p.provider == tag]
if not matched:
names = ", ".join(p.provider for p in live)
console.print(
f"[bold red]Warning:[/bold red] provider '{tag}' doesn't serve "
f"'{bare}'. Live providers: {names}. Checking anyway."
)
if not info.any_supports_tools:
console.print(
f"[bold red]Warning:[/bold red] no provider for '{bare}' advertises "
"tool-call support. This agent relies on tool calls β expect errors."
)
if tag in _ROUTING_POLICIES:
policy = tag
elif tag:
policy = f"pinned to {tag}"
else:
policy = "auto (fastest)"
console.print(f" [dim]routing: {policy}[/dim]")
for p in live:
price = (
f"${p.input_price:g}/${p.output_price:g} per M tok"
if p.input_price is not None and p.output_price is not None
else "price n/a"
)
ctx = f"{p.context_length:,} ctx" if p.context_length else "ctx n/a"
tools = "tools" if p.supports_tools else "no tools"
console.print(f" [dim]{p.provider}: {price}, {ctx}, {tools}[/dim]")
return True


def print_model_listing(config, console) -> None:
"""Render the default ``/model`` (no-arg) view: current + suggested."""
current = config.model_name if config else ""
console.print("[bold]Current model:[/bold]")
console.print(f" {current}")
console.print("\n[bold]Suggested:[/bold]")
for m in SUGGESTED_MODELS:
marker = " [dim]<-- current[/dim]" if m["id"] == current else ""
console.print(f" {m['id']} [dim]({m['label']})[/dim]{marker}")
console.print(
"\n[dim]Paste any HF model id (e.g. 'MiniMaxAI/MiniMax-M2.7').\n"
"Add ':fastest', ':cheapest', ':preferred', or ':<provider>' to override routing.\n"
"Use 'anthropic/<model>' or 'openai/<model>' for direct API access.[/dim]"
)


def print_invalid_id(arg: str, console) -> None:
console.print(f"[bold red]Invalid model id format:[/bold red] {arg}")
console.print(
"[dim]Expected:\n"
" β’ <org>/<model>[:tag] (HF router β paste from huggingface.co)\n"
" β’ anthropic/<model>\n"
" β’ openai/<model>\n"
" β’ llamacpp/<model> (llama.cpp server, http://localhost:8080)\n"
" β’ lmstudio/<model> (LM Studio, http://localhost:1234)\n"
" β’ mlx/<model> (MLX OpenAI-compatible server)\n"
" β’ nim/<model> (NVIDIA NIM, http://localhost:8000)\n"
" β’ ollama/<model> (Ollama, http://localhost:11434)\n"
" β’ vllm/<model> (vLLM, http://localhost:8000)\n"
" β’ tgi/<model> (TGI, http://localhost:8080)\n"
" β’ local/<model> (custom LOCAL_API_BASE)[/dim]"
)


async def probe_and_switch_model(
model_id: str,
config,
session,
console,
hf_token: str | None,
) -> None:
"""Validate model+effort with a 1-token ping, cache the effective effort,
then commit the switch.

    Three visible outcomes:

    • ``effort: <level>`` - model accepted the preferred effort (or a
      fallback from the cascade; the note explains if so)
    • ``effort: off`` - model doesn't support thinking; we'll strip it
    • hard error (auth, model-not-found, quota) - we reject the switch
      and keep the current model so the user isn't stranded

Transient errors (5xx, timeout) complete the switch with a yellow
warning; the next real call re-surfaces the error if it's persistent.
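
    Rendered roughly as (placeholders in angle brackets; Rich markup omitted)::

        Model switched to <id> (effort: <level>, <ms>ms)    # accepted
        Model switched to <id> (couldn't validate: <why>; will verify on first message)
        Switch failed: <error>                              # current model kept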
"""
preference = config.reasoning_effort
if not _print_hf_routing_info(model_id, console):
return
if not preference:
# Nothing to validate with a ping that we couldn't validate on the
# first real call just as cheaply. Skip the probe entirely.
_commit_switch(model_id, config, session, effective=None, cache=False)
console.print(
f"[green]Model switched to {model_id}[/green] [dim](effort: off)[/dim]"
)
return
console.print(f"[dim]checking {model_id} (effort: {preference})...[/dim]")
try:
outcome = await probe_effort(model_id, preference, hf_token, session=session)
except ProbeInconclusive as e:
_commit_switch(model_id, config, session, effective=None, cache=False)
console.print(
f"[yellow]Model switched to {model_id}[/yellow] "
f"[dim](couldn't validate: {e}; will verify on first message)[/dim]"
)
return
except Exception as e:
        # Hard persistent error - auth, unknown model, quota. Don't switch.
console.print(f"[bold red]Switch failed:[/bold red] {e}")
console.print(f"[dim]Keeping current model: {config.model_name}[/dim]")
return
_commit_switch(
model_id,
config,
session,
effective=outcome.effective_effort,
cache=True,
)
effort_label = outcome.effective_effort or "off"
suffix = f" β {outcome.note}" if outcome.note else ""
console.print(
f"[green]Model switched to {model_id}[/green] "
f"[dim](effort: {effort_label}{suffix}, {outcome.elapsed_ms}ms)[/dim]"
)


def _commit_switch(model_id, config, session, effective, cache: bool) -> None:
"""Apply the switch to the session (or bare config if no session yet).
``effective`` is the probe's resolved effort; ``cache=True`` stores it
in the session's per-model cache so real calls use the resolved level
    instead of re-probing. ``cache=False`` (inconclusive probe / effort
    off) drops any stale cached entry for the model - the next call falls
    back to the preference.
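
    E.g. (hypothetical values) a probe that resolved the preferred "high"
    down to "medium" leaves ``session.model_effective_effort["openai/gpt-5.5"]``
    set to ``"medium"``.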
"""
if session is not None:
session.update_model(model_id)
if cache:
session.model_effective_effort[model_id] = effective
else:
session.model_effective_effort.pop(model_id, None)
else:
config.model_name = model_id