raazkumar committed on
Commit 286afc5 · verified · 1 Parent(s): a88d431

feat: add local model provider support to llm_params.py

Files changed (1)
  1. agent/core/llm_params.py +236 -0
agent/core/llm_params.py ADDED
@@ -0,0 +1,236 @@
"""LiteLLM kwargs resolution for the model ids this agent accepts.

Kept separate from ``agent_loop`` so tools (research, context compaction, etc.)
can import it without pulling in the whole agent loop / tool router and
creating circular imports.
"""

import os

from agent.core.hf_tokens import get_hf_bill_to, resolve_hf_router_token


def _resolve_hf_router_token(session_hf_token: str | None = None) -> str | None:
    """Backward-compatible private wrapper used by tests and older imports."""
    return resolve_hf_router_token(session_hf_token)


def _patch_litellm_effort_validation() -> None:
    """Neuter LiteLLM 1.83's hardcoded effort-level validation.

    Context: at ``litellm/llms/anthropic/chat/transformation.py:~1443`` the
    Anthropic adapter validates ``output_config.effort ∈ {high, medium,
    low, max}`` and gates ``max`` behind an ``_is_opus_4_6_model`` check
    that only matches the substring ``opus-4-6`` / ``opus_4_6``. Result:

    * ``xhigh`` — valid on Anthropic's real API for Claude 4.7 — is
      rejected pre-flight with "Invalid effort value: xhigh".
    * ``max`` on Opus 4.7 is rejected with "effort='max' is only supported
      by Claude Opus 4.6", even though Opus 4.7 accepts it in practice.

    We don't want to maintain a parallel model table, so we let the
    Anthropic API itself be the validator: widen ``_is_opus_4_6_model``
    to also match ``opus-4-7``+ families, and drop the valid-effort-set
    check entirely. If Anthropic rejects an effort level, we see a 400
    and the cascade walks down — exactly the behavior we want for any
    future model family.

    Removable once litellm ships 1.83.8-stable (which merges PR #25867,
    "Litellm day 0 opus 4.7 support") — see commit 0868a82 on their main
    branch. Until then, this one-time patch is the escape hatch.
    """
    try:
        from litellm.llms.anthropic.chat import transformation as _t
    except Exception:
        return

    cfg = getattr(_t, "AnthropicConfig", None)
    if cfg is None:
        return

    original = getattr(cfg, "_is_opus_4_6_model", None)
    if original is None or getattr(original, "_hf_agent_patched", False):
        return

    def _widened(model: str) -> bool:
        m = model.lower()
        # Original 4.6 match plus any future Opus >= 4.6. We only need this
        # to return True for families where "max" / "xhigh" are acceptable
        # at the API; the cascade handles the case when they're not.
        return any(
            v in m
            for v in (
                "opus-4-6",
                "opus_4_6",
                "opus-4.6",
                "opus_4.6",
                "opus-4-7",
                "opus_4_7",
                "opus-4.7",
                "opus_4.7",
            )
        )

    _widened._hf_agent_patched = True  # type: ignore[attr-defined]
    cfg._is_opus_4_6_model = staticmethod(_widened)


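# Applied once at import time; repeated imports are no-ops thanks to the
# ``_hf_agent_patched`` sentinel checked above.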
_patch_litellm_effort_validation()


# Effort levels accepted on the wire.
#   Anthropic (4.6+): low | medium | high | xhigh | max      (output_config.effort)
#   OpenAI direct:    minimal | low | medium | high | xhigh  (reasoning_effort top-level)
#   HF router:        low | medium | high                    (extra_body.reasoning_effort)
#
# We validate *shape* here and let the probe cascade walk down on rejection;
# we deliberately do NOT maintain a per-model capability table.
_ANTHROPIC_EFFORTS = {"low", "medium", "high", "xhigh", "max"}
_OPENAI_EFFORTS = {"minimal", "low", "medium", "high", "xhigh"}
_HF_EFFORTS = {"low", "medium", "high"}


class UnsupportedEffortError(ValueError):
    """The requested effort isn't valid for this provider's API surface.

    Raised synchronously before any network call so the probe cascade can
    skip levels the provider can't accept (e.g. ``max`` on HF router).
    """


def _resolve_llm_params(
    model_name: str,
    session_hf_token: str | None = None,
    reasoning_effort: str | None = None,
    strict: bool = False,
) -> dict:
    """Build LiteLLM kwargs for a given model id.

    • ``anthropic/<model>`` — native thinking config. We bypass LiteLLM's
      ``reasoning_effort`` → ``thinking`` mapping (which lags new Claude
      releases like 4.7 and sends the wrong API shape). Instead we pass
      both ``thinking={"type": "adaptive"}`` and ``output_config=
      {"effort": <level>}`` as top-level kwargs — LiteLLM's Anthropic
      adapter forwards unknown top-level kwargs into the request body
      verbatim (confirmed by live probe; ``extra_body`` does NOT work
      here because Anthropic's API rejects it as "Extra inputs are not
      permitted"). This is the stable API for 4.6 and 4.7. Older
      extended-thinking models that only accept ``thinking.type.enabled``
      will reject this; the probe's cascade catches that and falls back
      to no thinking.

    • ``openai/<model>`` — ``reasoning_effort`` forwarded as a top-level
      kwarg (GPT-5 / o-series). LiteLLM uses the user's ``OPENAI_API_KEY``.

    • Anything else is treated as a HuggingFace router id. We hit the
      auto-routing OpenAI-compatible endpoint at
      ``https://router.huggingface.co/v1``. The id can be bare or carry an
      HF routing suffix (``:fastest`` / ``:cheapest`` / ``:<provider>``).
      A leading ``huggingface/`` is stripped. ``reasoning_effort`` is
      forwarded via ``extra_body`` (LiteLLM's OpenAI adapter refuses it as
      a top-level kwarg for non-OpenAI models). "minimal" normalizes to
      "low".

    ``strict=True`` raises ``UnsupportedEffortError`` when the requested
    effort isn't in the provider's accepted set, instead of silently
    dropping it. The probe cascade uses strict mode so it can walk down
    (``max`` → ``xhigh`` → ``high`` …) without making an API call. Regular
    runtime callers leave ``strict=False``, so a stale cached effort
    can't crash a turn — it just doesn't get sent.

    Token precedence (first non-empty wins):
    1. INFERENCE_TOKEN env — shared key on the hosted Space (inference is
       free for users, billed to the Space owner via ``X-HF-Bill-To``).
    2. session.hf_token — the user's own token (CLI / OAuth / cache file).
    3. huggingface_hub cache — ``HF_TOKEN`` / ``HUGGING_FACE_HUB_TOKEN`` /
       local ``hf auth login`` cache.
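
    Illustrative examples (outputs follow from the rules above; the model
    ids are placeholders, and env-dependent fields assume a bare
    environment):

        _resolve_llm_params("anthropic/claude-opus-4-6", reasoning_effort="max")
        # -> {"model": "anthropic/claude-opus-4-6",
        #     "thinking": {"type": "adaptive"},
        #     "output_config": {"effort": "max"}}

        _resolve_llm_params("ollama/llama3")
        # -> {"model": "openai/llama3",
        #     "api_base": "http://localhost:11434/v1",  # OLLAMA_API_BASE unset
        #     "api_key": "no-key"}                      # LOCAL_API_KEY unset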
    """
    if model_name.startswith("anthropic/"):
        params: dict = {"model": model_name}
        if reasoning_effort:
            level = reasoning_effort
            if level == "minimal":
                level = "low"
            if level not in _ANTHROPIC_EFFORTS:
                if strict:
                    raise UnsupportedEffortError(
                        f"Anthropic doesn't accept effort={level!r}"
                    )
            else:
                # Adaptive thinking + output_config.effort is the stable
                # Anthropic API for Claude 4.6 / 4.7. Both kwargs are
                # passed top-level: LiteLLM forwards unknown params into
                # the request body for Anthropic, so ``output_config``
                # reaches the API. ``extra_body`` does NOT work here —
                # Anthropic rejects it as "Extra inputs are not
                # permitted".
                params["thinking"] = {"type": "adaptive"}
                params["output_config"] = {"effort": level}
        return params

    if model_name.startswith("bedrock/"):
        # LiteLLM routes ``bedrock/...`` through the Converse adapter, which
        # picks up AWS credentials from the standard env vars
        # (``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` / ``AWS_REGION``).
        # The Anthropic thinking/effort shape is not forwarded through Converse
        # the same way, so we leave it off for now.
        return {"model": model_name}

    # ── Local / self-hosted providers (OpenAI-compatible endpoints) ──────────
    # These prefixes route to local inference servers. LiteLLM's ``openai/``
    # adapter is used with a custom ``api_base`` (and optionally ``api_key``).
    # Reasoning effort is skipped — local servers rarely support it.
    _LOCAL_PROVIDERS: dict[str, tuple[str, str]] = {
        # prefix → (default api_base, env var for api_base override)
        "llamacpp/": ("http://localhost:8080/v1", "LLAMACPP_API_BASE"),
        "lmstudio/": ("http://localhost:1234/v1", "LMSTUDIO_API_BASE"),
        "mlx/": ("http://localhost:8000/v1", "MLX_API_BASE"),
        "nim/": ("http://localhost:8000/v1", "NIM_API_BASE"),
        "local/": ("http://localhost:8000/v1", "LOCAL_API_BASE"),
        "ollama/": ("http://localhost:11434/v1", "OLLAMA_API_BASE"),
        "vllm/": ("http://localhost:8000/v1", "VLLM_API_BASE"),
        "tgi/": ("http://localhost:8080/v1", "TGI_API_BASE"),
    }
    for prefix, (default_base, env_override) in _LOCAL_PROVIDERS.items():
        if model_name.startswith(prefix):
            api_base = os.environ.get(env_override, default_base)
            api_key = os.environ.get("LOCAL_API_KEY", "no-key")
            return {
                "model": f"openai/{model_name.removeprefix(prefix)}",
                "api_base": api_base,
                "api_key": api_key,
            }
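    # Illustrative: with VLLM_API_BASE=http://gpubox:8000/v1 set (hypothetical
    # host), the placeholder id "vllm/Qwen2.5-7B" resolves to
    #   {"model": "openai/Qwen2.5-7B",
    #    "api_base": "http://gpubox:8000/v1", "api_key": "no-key"}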
    # ─────────────────────────────────────────────────────────────────────────

    if model_name.startswith("openai/"):
        params = {"model": model_name}
        if reasoning_effort:
            if reasoning_effort not in _OPENAI_EFFORTS:
                if strict:
                    raise UnsupportedEffortError(
                        f"OpenAI doesn't accept effort={reasoning_effort!r}"
                    )
            else:
                params["reasoning_effort"] = reasoning_effort
        return params

    hf_model = model_name.removeprefix("huggingface/")
    api_key = _resolve_hf_router_token(session_hf_token)
    params = {
        "model": f"openai/{hf_model}",
        "api_base": "https://router.huggingface.co/v1",
        "api_key": api_key,
    }
    if bill_to := get_hf_bill_to():
        params["extra_headers"] = {"X-HF-Bill-To": bill_to}
    if reasoning_effort:
        hf_level = "low" if reasoning_effort == "minimal" else reasoning_effort
        if hf_level not in _HF_EFFORTS:
            if strict:
                raise UnsupportedEffortError(
                    f"HF router doesn't accept effort={hf_level!r}"
                )
        else:
            params["extra_body"] = {"reasoning_effort": hf_level}
    return params
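

# Illustrative sketch only: the real probe cascade lives elsewhere in the
# agent. This is a minimal rendition of the strict-mode walk-down described
# in the docstrings above; the ladder and the function name are assumptions
# for illustration, not part of the module's actual API.
def _example_effort_walk_down(model_name: str, requested: str) -> dict:
    ladder = ["max", "xhigh", "high", "medium", "low"]
    if requested in ladder:
        ladder = ladder[ladder.index(requested):]
    for level in ladder:
        try:
            # strict=True fails synchronously, so no API call is spent on
            # levels the provider's shape can't accept.
            return _resolve_llm_params(model_name, reasoning_effort=level, strict=True)
        except UnsupportedEffortError:
            continue  # e.g. "max" on the HF router: try "xhigh", then "high"
    return _resolve_llm_params(model_name)  # drop the effort entirely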