feat: add local model provider support to llm_params.py
agent/core/llm_params.py  ADDED  (+236 -0)

@@ -0,0 +1,236 @@
"""LiteLLM kwargs resolution for the model ids this agent accepts.

Kept separate from ``agent_loop`` so tools (research, context compaction, etc.)
can import it without pulling in the whole agent loop / tool router and
creating circular imports.
"""

import os

from agent.core.hf_tokens import get_hf_bill_to, resolve_hf_router_token


def _resolve_hf_router_token(session_hf_token: str | None = None) -> str | None:
    """Backward-compatible private wrapper used by tests and older imports."""
    return resolve_hf_router_token(session_hf_token)

def _patch_litellm_effort_validation() -> None:
    """Neuter LiteLLM 1.83's hardcoded effort-level validation.

    Context: at ``litellm/llms/anthropic/chat/transformation.py:~1443`` the
    Anthropic adapter validates ``output_config.effort ∈ {high, medium,
    low, max}`` and gates ``max`` behind an ``_is_opus_4_6_model`` check
    that only matches the substring ``opus-4-6`` / ``opus_4_6``. Result:

    * ``xhigh``, valid on Anthropic's real API for Claude 4.7, is
      rejected pre-flight with "Invalid effort value: xhigh".
    * ``max`` on Opus 4.7 is rejected with "effort='max' is only supported
      by Claude Opus 4.6", even though Opus 4.7 accepts it in practice.

    We don't want to maintain a parallel model table, so we let the
    Anthropic API itself be the validator: widen ``_is_opus_4_6_model``
    to also match ``opus-4-7``+ families, and drop the valid-effort-set
    check entirely. If Anthropic rejects an effort level, we see a 400
    and the cascade walks down; that is exactly the behavior we want for
    any future model family.

    Removable once litellm ships 1.83.8-stable (which merges PR #25867,
    "Litellm day 0 opus 4.7 support"); see commit 0868a82 on their main
    branch. Until then, this one-time patch is the escape hatch.
    """
    try:
        from litellm.llms.anthropic.chat import transformation as _t
    except Exception:
        return

    cfg = getattr(_t, "AnthropicConfig", None)
    if cfg is None:
        return

    original = getattr(cfg, "_is_opus_4_6_model", None)
    if original is None or getattr(original, "_hf_agent_patched", False):
        return

    def _widened(model: str) -> bool:
        m = model.lower()
        # Original 4.6 match plus any future Opus >= 4.6. We only need this
        # to return True for families where "max" / "xhigh" are acceptable
        # at the API; the cascade handles the case when they're not.
        return any(
            v in m
            for v in (
                "opus-4-6",
                "opus_4_6",
                "opus-4.6",
                "opus_4.6",
                "opus-4-7",
                "opus_4_7",
                "opus-4.7",
                "opus_4.7",
            )
        )

    _widened._hf_agent_patched = True  # type: ignore[attr-defined]
    cfg._is_opus_4_6_model = staticmethod(_widened)


_patch_litellm_effort_validation()

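# With the patch applied, a call shaped like the following (illustrative model
# id; ``thinking`` / ``output_config`` passed top-level exactly as
# ``_resolve_llm_params`` builds them below) passes LiteLLM's pre-flight check
# and lets the Anthropic API itself accept or reject the effort level:
#
#     litellm.completion(
#         model="anthropic/claude-opus-4-7",
#         messages=[...],
#         thinking={"type": "adaptive"},
#         output_config={"effort": "max"},
#     )
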
# Effort levels accepted on the wire.
# Anthropic (4.6+):  low | medium | high | xhigh | max   (output_config.effort)
# OpenAI direct:     minimal | low | medium | high | xhigh  (reasoning_effort top-level)
# HF router:         low | medium | high   (extra_body.reasoning_effort)
#
# We validate *shape* here and let the probe cascade walk down on rejection;
# we deliberately do NOT maintain a per-model capability table.
_ANTHROPIC_EFFORTS = {"low", "medium", "high", "xhigh", "max"}
_OPENAI_EFFORTS = {"minimal", "low", "medium", "high", "xhigh"}
_HF_EFFORTS = {"low", "medium", "high"}


class UnsupportedEffortError(ValueError):
    """The requested effort isn't valid for this provider's API surface.

    Raised synchronously before any network call so the probe cascade can
    skip levels the provider can't accept (e.g. ``max`` on HF router).
    """

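# Illustrative strict-mode cascade (the real probe lives elsewhere; ``model``
# is a placeholder): walk down effort levels without any network call until
# one the provider can express is found.
#
#     for level in ("max", "xhigh", "high", "medium", "low"):
#         try:
#             params = _resolve_llm_params(model, reasoning_effort=level, strict=True)
#         except UnsupportedEffortError:
#             continue  # provider can't express this level; try the next one
#         break
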
def _resolve_llm_params(
    model_name: str,
    session_hf_token: str | None = None,
    reasoning_effort: str | None = None,
    strict: bool = False,
) -> dict:
    """
    Build LiteLLM kwargs for a given model id.

    • ``anthropic/<model>`` → native thinking config. We bypass LiteLLM's
      ``reasoning_effort`` → ``thinking`` mapping (which lags new Claude
      releases like 4.7 and sends the wrong API shape). Instead we pass
      both ``thinking={"type": "adaptive"}`` and ``output_config=
      {"effort": <level>}`` as top-level kwargs; LiteLLM's Anthropic
      adapter forwards unknown top-level kwargs into the request body
      verbatim (confirmed by live probe; ``extra_body`` does NOT work
      here because Anthropic's API rejects it as "Extra inputs are not
      permitted"). This is the stable API for 4.6 and 4.7. Older
      extended-thinking models that only accept ``thinking.type.enabled``
      will reject this; the probe's cascade catches that and falls back
      to no thinking.

    • ``bedrock/<model>`` → LiteLLM's Converse adapter with the standard
      AWS env credentials; thinking/effort is not forwarded.

    • Local / self-hosted prefixes (``llamacpp/``, ``lmstudio/``, ``mlx/``,
      ``nim/``, ``local/``, ``ollama/``, ``vllm/``, ``tgi/``) → LiteLLM's
      ``openai/`` adapter pointed at an OpenAI-compatible ``api_base``
      (localhost default, overridable per provider via env var).

    • ``openai/<model>`` → ``reasoning_effort`` forwarded as a top-level
      kwarg (GPT-5 / o-series). LiteLLM uses the user's ``OPENAI_API_KEY``.

    • Anything else is treated as a HuggingFace router id. We hit the
      auto-routing OpenAI-compatible endpoint at
      ``https://router.huggingface.co/v1``. The id can be bare or carry an
      HF routing suffix (``:fastest`` / ``:cheapest`` / ``:<provider>``).
      A leading ``huggingface/`` is stripped. ``reasoning_effort`` is
      forwarded via ``extra_body`` (LiteLLM's OpenAI adapter refuses it as
      a top-level kwarg for non-OpenAI models). "minimal" normalizes to
      "low".

    ``strict=True`` raises ``UnsupportedEffortError`` when the requested
    effort isn't in the provider's accepted set, instead of silently
    dropping it. The probe cascade uses strict mode so it can walk down
    (``max`` → ``xhigh`` → ``high`` → …) without making an API call.
    Regular runtime callers leave ``strict=False``, so a stale cached
    effort can't crash a turn; it just doesn't get sent.

    Token precedence (first non-empty wins):

    1. ``INFERENCE_TOKEN`` env var: shared key on the hosted Space
       (inference is free for users, billed to the Space owner via
       ``X-HF-Bill-To``).
    2. ``session.hf_token``: the user's own token (CLI / OAuth / cache file).
    3. huggingface_hub cache: ``HF_TOKEN`` / ``HUGGING_FACE_HUB_TOKEN`` /
       local ``hf auth login`` cache.
    """
    if model_name.startswith("anthropic/"):
        params: dict = {"model": model_name}
        if reasoning_effort:
            level = reasoning_effort
            if level == "minimal":
                level = "low"
            if level not in _ANTHROPIC_EFFORTS:
                if strict:
                    raise UnsupportedEffortError(
                        f"Anthropic doesn't accept effort={level!r}"
                    )
            else:
                # Adaptive thinking + output_config.effort is the stable
                # Anthropic API for Claude 4.6 / 4.7. Both kwargs are
                # passed top-level: LiteLLM forwards unknown params into
                # the request body for Anthropic, so ``output_config``
                # reaches the API. ``extra_body`` does NOT work here;
                # Anthropic rejects it as "Extra inputs are not
                # permitted".
                params["thinking"] = {"type": "adaptive"}
                params["output_config"] = {"effort": level}
        return params

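    # Example (illustrative model id): "anthropic/claude-opus-4-6" with
    # reasoning_effort="high" resolves to
    #   {"model": "anthropic/claude-opus-4-6",
    #    "thinking": {"type": "adaptive"},
    #    "output_config": {"effort": "high"}}
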
    if model_name.startswith("bedrock/"):
        # LiteLLM routes ``bedrock/...`` through the Converse adapter, which
        # picks up AWS credentials from the standard env vars
        # (``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` / ``AWS_REGION``).
        # The Anthropic thinking/effort shape is not forwarded through Converse
        # the same way, so we leave it off for now.
        return {"model": model_name}

    # ── Local / self-hosted providers (OpenAI-compatible endpoints) ──────────
    # These prefixes route to local inference servers. LiteLLM's ``openai/``
    # adapter is used with a custom ``api_base`` (and optionally ``api_key``).
    # Reasoning effort is skipped; local servers rarely support it.
    _LOCAL_PROVIDERS: dict[str, tuple[str, str]] = {
        # prefix → (default api_base, env var for api_base override)
        "llamacpp/": ("http://localhost:8080/v1", "LLAMACPP_API_BASE"),
        "lmstudio/": ("http://localhost:1234/v1", "LMSTUDIO_API_BASE"),
        "mlx/": ("http://localhost:8000/v1", "MLX_API_BASE"),
        "nim/": ("http://localhost:8000/v1", "NIM_API_BASE"),
        "local/": ("http://localhost:8000/v1", "LOCAL_API_BASE"),
        "ollama/": ("http://localhost:11434/v1", "OLLAMA_API_BASE"),
        "vllm/": ("http://localhost:8000/v1", "VLLM_API_BASE"),
        "tgi/": ("http://localhost:8080/v1", "TGI_API_BASE"),
    }
    for prefix, (default_base, env_override) in _LOCAL_PROVIDERS.items():
        if model_name.startswith(prefix):
            api_base = os.environ.get(env_override, default_base)
            api_key = os.environ.get("LOCAL_API_KEY", "no-key")
            return {
                "model": f"openai/{model_name.removeprefix(prefix)}",
                "api_base": api_base,
                "api_key": api_key,
            }
    # ─────────────────────────────────────────────────────────────────────────

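    # Example (illustrative, assuming no env overrides are set): "ollama/llama3.1"
    # resolves to
    #   {"model": "openai/llama3.1",
    #    "api_base": "http://localhost:11434/v1",
    #    "api_key": "no-key"}
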
    if model_name.startswith("openai/"):
        params = {"model": model_name}
        if reasoning_effort:
            if reasoning_effort not in _OPENAI_EFFORTS:
                if strict:
                    raise UnsupportedEffortError(
                        f"OpenAI doesn't accept effort={reasoning_effort!r}"
                    )
            else:
                params["reasoning_effort"] = reasoning_effort
        return params

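    # Example (illustrative id): "openai/gpt-5" with reasoning_effort="high"
    # resolves to {"model": "openai/gpt-5", "reasoning_effort": "high"}
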
    hf_model = model_name.removeprefix("huggingface/")
    api_key = _resolve_hf_router_token(session_hf_token)
    params = {
        "model": f"openai/{hf_model}",
        "api_base": "https://router.huggingface.co/v1",
        "api_key": api_key,
    }
    if bill_to := get_hf_bill_to():
        params["extra_headers"] = {"X-HF-Bill-To": bill_to}
    if reasoning_effort:
        hf_level = "low" if reasoning_effort == "minimal" else reasoning_effort
        if hf_level not in _HF_EFFORTS:
            if strict:
                raise UnsupportedEffortError(
                    f"HF router doesn't accept effort={hf_level!r}"
                )
        else:
            params["extra_body"] = {"reasoning_effort": hf_level}
    return params
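

# Usage sketch (illustrative router id): "huggingface/Qwen/Qwen3-Next:fastest"
# with reasoning_effort="high" resolves to
#   {"model": "openai/Qwen/Qwen3-Next:fastest",
#    "api_base": "https://router.huggingface.co/v1",
#    "api_key": <token resolved via INFERENCE_TOKEN / session / hub cache>,
#    "extra_body": {"reasoning_effort": "high"}}
# plus {"extra_headers": {"X-HF-Bill-To": ...}} when a billing org is set.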