akseljoonas (HF Staff) committed
Commit 0a9e96d · Parent: 1c0de34

Route HF inference through /v1 auto-router + add reasoning_effort knob


Users paste bare HF model ids (MiniMaxAI/MiniMax-M2.7, moonshotai/Kimi-K2.6)
with an optional :fastest|cheapest|preferred|<provider> suffix; the router
picks a provider and handles failover. /model does a live preflight against
/v1/models and prints providers, pricing, context length, and tool support;
unknown ids get a warning plus fuzzy suggestions but are still allowed.
Friendly messages replace
LiteLLM's raw traceback for model/provider mismatches, and the noisy
'Give Feedback' banner is suppressed.

Adds a reasoning_effort config + /effort command (default high). OpenAI and
Anthropic get the top-level param natively; HF router gets it via extra_body
with minimal->low normalization for models like MiniMax M2 that require
reasoning. Frontend + backend model selectors updated to the bare-id format.
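
For a concrete feel, here is a rough sketch of the LiteLLM kwargs the new
resolver produces (derived from agent/core/llm_params.py below; the "hf_xxx"
token is a placeholder and INFERENCE_TOKEN is assumed to be unset):

from agent.core.llm_params import _resolve_llm_params

# Direct-API ids pass through; reasoning_effort rides along as a top-level param.
_resolve_llm_params("anthropic/claude-opus-4-6", reasoning_effort="high")
# -> {"model": "anthropic/claude-opus-4-6", "reasoning_effort": "high"}

# Bare HF ids become an OpenAI-compatible call against the /v1 auto-router;
# "minimal" is normalized to "low" and forwarded via extra_body.
_resolve_llm_params("MiniMaxAI/MiniMax-M2.7:cheapest", "hf_xxx", reasoning_effort="minimal")
# -> {"model": "openai/MiniMaxAI/MiniMax-M2.7:cheapest",
#     "api_base": "https://router.huggingface.co/v1",
#     "api_key": "hf_xxx",
#     "extra_body": {"reasoning_effort": "low"}}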

agent/config.py CHANGED
@@ -33,6 +33,15 @@ class Config(BaseModel):
     confirm_cpu_jobs: bool = True
     auto_file_upload: bool = False
 
+    # Reasoning effort for models that support it (GPT-5 / o-series, Claude
+    # extended thinking, HF reasoning models like MiniMax M2 / Kimi K2).
+    # Defaults to "high" — we'd rather spend tokens thinking than ship a
+    # wrong ML recipe. Users can dial down with `/effort low|medium|off`.
+    # "minimal" is an OpenAI-only level and is normalized to "low" for HF
+    # router models (MiniMax requires ≥low). Ignored for non-reasoning models.
+    # Valid values: None | "minimal" | "low" | "medium" | "high"
+    reasoning_effort: str | None = "high"
+
 
 def substitute_env_vars(obj: Any) -> Any:
     """
agent/context_manager/manager.py CHANGED
@@ -306,19 +306,14 @@ class ContextManager:
             )
         )
 
-        hf_key = (
-            os.environ.get("INFERENCE_TOKEN")
-            or hf_token
-            or os.environ.get("HF_TOKEN")
-        )
+        from agent.core.llm_params import _resolve_llm_params
+
+        llm_params = _resolve_llm_params(model_name, hf_token, reasoning_effort="high")
         response = await acompletion(
-            model=model_name,
             messages=messages_to_summarize,
             max_completion_tokens=self.compact_size,
             tools=tool_specs,
-            api_key=hf_key
-            if hf_key and model_name.startswith("huggingface/")
-            else None,
+            **llm_params,
         )
         summarized_message = Message(
             role="assistant", content=response.choices[0].message.content
agent/core/agent_loop.py CHANGED
@@ -13,6 +13,7 @@ from litellm.exceptions import ContextWindowExceededError
 
 from agent.config import Config
 from agent.core.doom_loop import check_for_doom_loop
+from agent.core.llm_params import _resolve_llm_params
 from agent.core.session import Event, OpType, Session
 from agent.core.tools import ToolRouter
 from agent.tools.jobs_tool import CPU_FLAVORS
@@ -22,51 +23,6 @@ logger = logging.getLogger(__name__)
 ToolCall = ChatCompletionMessageToolCall
 
 
-def _resolve_hf_router_params(
-    model_name: str, session_hf_token: str | None = None
-) -> dict:
-    """
-    Build LiteLLM kwargs for HuggingFace Router models.
-
-    api-inference.huggingface.co is deprecated; the new router lives at
-    router.huggingface.co/<provider>/v3/openai. LiteLLM's built-in
-    ``huggingface/`` provider still targets the old endpoint, so we
-    rewrite model names to ``openai/`` and supply the correct api_base.
-
-    Input format: huggingface/<router_provider>/<org>/<model>
-    Example: huggingface/novita/moonshotai/kimi-k2.5
-
-    Token resolution (first non-empty wins):
-    1. INFERENCE_TOKEN env — shared key on the hosted Space so inference
-       is free for users and billed to the Space owner.
-    2. session.hf_token — the user's own token (CLI or self-hosted),
-       resolved from env / huggingface-cli login / cached token file.
-    3. HF_TOKEN env — belt-and-suspenders fallback for CLI users.
-    """
-    if not model_name.startswith("huggingface/"):
-        return {"model": model_name}
-
-    parts = model_name.split(
-        "/", 2
-    )  # ['huggingface', 'novita', 'moonshotai/kimi-k2.5']
-    if len(parts) < 3:
-        return {"model": model_name}
-
-    router_provider = parts[1]
-    actual_model = parts[2]
-    api_key = (
-        os.environ.get("INFERENCE_TOKEN")
-        or session_hf_token
-        or os.environ.get("HF_TOKEN")
-    )
-
-    return {
-        "model": f"openai/{actual_model}",
-        "api_base": f"https://router.huggingface.co/{router_provider}/v3/openai",
-        "api_key": api_key,
-    }
-
-
 def _validate_tool_args(tool_args: dict) -> tuple[bool, str | None]:
     """
     Validate tool arguments structure.
@@ -201,6 +157,24 @@ def _friendly_error_message(error: Exception) -> str | None:
             "at your model provider's dashboard."
         )
 
+    if "not supported by provider" in err_str or "no provider supports" in err_str:
+        return (
+            "The model isn't served by the provider you pinned.\n\n"
+            "Drop the ':<provider>' suffix to let the HF router auto-pick a "
+            "provider, or use '/model' (no arg) to see which providers host "
+            "which models."
+        )
+
+    if "model_not_found" in err_str or (
+        "model" in err_str
+        and ("not found" in err_str or "does not exist" in err_str)
+    ):
+        return (
+            "Model not found. Use '/model' to list suggestions, or paste an "
+            "HF model id like 'MiniMaxAI/MiniMax-M2.7'. Availability is shown "
+            "when you switch."
+        )
+
     return None
 
 
@@ -518,8 +492,10 @@ class Handlers:
             tools = session.tool_router.get_tool_specs_for_llm()
             try:
                 # ── Call the LLM (streaming or non-streaming) ──
-                llm_params = _resolve_hf_router_params(
-                    session.config.model_name, session.hf_token
+                llm_params = _resolve_llm_params(
+                    session.config.model_name,
+                    session.hf_token,
+                    reasoning_effort=session.config.reasoning_effort,
                 )
                 if session.stream:
                     llm_result = await _call_llm_streaming(session, messages, tools, llm_params)
agent/core/hf_router_catalog.py ADDED
@@ -0,0 +1,129 @@
+"""Fetch and cache the HF Inference Router model catalog.
+
+The router exposes an OpenAI-compatible listing at
+``https://router.huggingface.co/v1/models`` with per-provider availability,
+pricing, context length, and tool-use support. We use it to:
+
+• Validate ``/model`` switches with live data instead of a hard-coded allowlist.
+• Show the user which providers serve a model, at what price, and whether they
+  support tool calls.
+• Derive a reasonable context-window limit for any routed model.
+
+The listing is cached in-memory for a few minutes so repeated lookups during a
+session are free. On fetch failure we return stale data if we have it, or an
+empty catalog otherwise.
+"""
+
+import logging
+import time
+from dataclasses import dataclass
+from difflib import get_close_matches
+from typing import Optional
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+_CATALOG_URL = "https://router.huggingface.co/v1/models"
+_CACHE_TTL_SECONDS = 300
+_HTTP_TIMEOUT_SECONDS = 5.0
+
+_cache: Optional[dict] = None
+_cache_time: float = 0.0
+
+
+@dataclass
+class ProviderInfo:
+    provider: str
+    status: str
+    context_length: Optional[int]
+    input_price: Optional[float]
+    output_price: Optional[float]
+    supports_tools: bool
+    supports_structured_output: bool
+
+
+@dataclass
+class ModelInfo:
+    id: str
+    providers: list[ProviderInfo]
+
+    @property
+    def live_providers(self) -> list[ProviderInfo]:
+        return [p for p in self.providers if p.status == "live"]
+
+    @property
+    def max_context_length(self) -> Optional[int]:
+        lengths = [p.context_length for p in self.live_providers if p.context_length]
+        return max(lengths) if lengths else None
+
+    @property
+    def any_supports_tools(self) -> bool:
+        return any(p.supports_tools for p in self.live_providers)
+
+
+def _fetch_catalog(force: bool = False) -> dict:
+    global _cache, _cache_time
+    now = time.time()
+    if not force and _cache is not None and now - _cache_time < _CACHE_TTL_SECONDS:
+        return _cache
+    try:
+        resp = httpx.get(_CATALOG_URL, timeout=_HTTP_TIMEOUT_SECONDS)
+        resp.raise_for_status()
+        _cache = resp.json()
+        _cache_time = now
+    except Exception as e:
+        logger.warning("Failed to fetch HF router catalog: %s", e)
+        if _cache is None:
+            _cache = {"data": []}
+            _cache_time = now
+    return _cache
+
+
+def _parse_entry(entry: dict) -> ModelInfo:
+    providers = []
+    for p in entry.get("providers", []) or []:
+        pricing = p.get("pricing") or {}
+        providers.append(
+            ProviderInfo(
+                provider=p.get("provider", ""),
+                status=p.get("status", ""),
+                context_length=p.get("context_length"),
+                input_price=pricing.get("input"),
+                output_price=pricing.get("output"),
+                supports_tools=bool(p.get("supports_tools", False)),
+                supports_structured_output=bool(p.get("supports_structured_output", False)),
+            )
+        )
+    return ModelInfo(id=entry.get("id", ""), providers=providers)
+
+
+def lookup(model_id: str) -> Optional[ModelInfo]:
+    """Find a model in the router catalog.
+
+    Accepts ``<org>/<model>`` or ``<org>/<model>:<tag>`` — the tag is stripped
+    for lookup. Returns ``None`` if the model isn't listed.
+    """
+    bare = model_id.split(":", 1)[0]
+    catalog = _fetch_catalog()
+    for entry in catalog.get("data", []):
+        if entry.get("id") == bare:
+            return _parse_entry(entry)
+    return None
+
+
+def fuzzy_suggest(model_id: str, limit: int = 3) -> list[str]:
+    """Return the closest model ids from the catalog."""
+    bare = model_id.split(":", 1)[0]
+    catalog = _fetch_catalog()
+    ids = [e.get("id", "") for e in catalog.get("data", []) if e.get("id")]
+    return get_close_matches(bare, ids, n=limit, cutoff=0.4)
+
+
+def prewarm() -> None:
+    """Fetch the catalog so subsequent lookups are instant. Safe to call from
+    a background task — swallows failures."""
+    try:
+        _fetch_catalog(force=False)
+    except Exception:
+        pass
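
For reference, lookup() and _parse_entry() above expect catalog entries shaped
roughly like the sketch below. The field names are exactly the ones _parse_entry
reads; the provider name, prices, and context length are made-up illustrative
values, not real quotes from the router.

from agent.core.hf_router_catalog import _parse_entry

entry = {
    "id": "MiniMaxAI/MiniMax-M2.7",
    "providers": [
        {
            "provider": "novita",                       # illustrative provider
            "status": "live",
            "context_length": 196_608,                  # illustrative value
            "pricing": {"input": 0.3, "output": 1.2},   # illustrative $/M tok
            "supports_tools": True,
            "supports_structured_output": True,
        },
    ],
}

info = _parse_entry(entry)
assert info.live_providers[0].provider == "novita"
assert info.any_supports_tools
assert info.max_context_length == 196_608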
agent/core/llm_params.py ADDED
@@ -0,0 +1,76 @@
+"""LiteLLM kwargs resolution for the model ids this agent accepts.
+
+Kept separate from ``agent_loop`` so tools (research, context compaction, etc.)
+can import it without pulling in the whole agent loop / tool router and
+creating circular imports.
+"""
+
+import os
+
+
+# HF router reasoning models only accept "low" | "medium" | "high" (e.g.
+# MiniMax M2 actually *requires* reasoning to be enabled). OpenAI's GPT-5
+# also accepts "minimal" for near-zero thinking. We map "minimal" to "low"
+# for HF so the user doesn't get a 400.
+_HF_ALLOWED_EFFORTS = {"low", "medium", "high"}
+
+
+def _resolve_llm_params(
+    model_name: str,
+    session_hf_token: str | None = None,
+    reasoning_effort: str | None = None,
+) -> dict:
+    """
+    Build LiteLLM kwargs for a given model id.
+
+    • ``anthropic/<model>`` / ``openai/<model>`` — passed straight through; the
+      user's own ``ANTHROPIC_API_KEY`` / ``OPENAI_API_KEY`` env vars are picked
+      up by LiteLLM. ``reasoning_effort`` is forwarded as a top-level param
+      (GPT-5 / o-series accept "minimal" | "low" | "medium" | "high"; Claude
+      extended-thinking models accept "low" | "medium" | "high" and LiteLLM
+      translates to the thinking config).
+
+    • Anything else is treated as a HuggingFace router id. We hit the
+      auto-routing OpenAI-compatible endpoint at
+      ``https://router.huggingface.co/v1``, which bypasses LiteLLM's stale
+      per-provider HF adapter entirely. The id can be bare or carry an HF
+      routing suffix:
+
+        MiniMaxAI/MiniMax-M2.7            # auto = fastest + failover
+        MiniMaxAI/MiniMax-M2.7:cheapest
+        moonshotai/Kimi-K2.6:novita       # pin a specific provider
+
+    A leading ``huggingface/`` is stripped for convenience. ``reasoning_effort``
+    is forwarded via ``extra_body`` (LiteLLM's OpenAI adapter refuses it as a
+    top-level kwarg for non-OpenAI models). "minimal" is normalized to "low".
+
+    Token precedence (first non-empty wins):
+    1. INFERENCE_TOKEN env — shared key on the hosted Space (inference is
+       free for users, billed to the Space owner via ``X-HF-Bill-To``).
+    2. session.hf_token — the user's own token (CLI / OAuth / cache file).
+    3. HF_TOKEN env — belt-and-suspenders fallback for CLI users.
+    """
+    if model_name.startswith(("anthropic/", "openai/")):
+        params: dict = {"model": model_name}
+        if reasoning_effort:
+            params["reasoning_effort"] = reasoning_effort
+        return params
+
+    hf_model = model_name.removeprefix("huggingface/")
+    api_key = (
+        os.environ.get("INFERENCE_TOKEN")
+        or session_hf_token
+        or os.environ.get("HF_TOKEN")
+    )
+    params = {
+        "model": f"openai/{hf_model}",
+        "api_base": "https://router.huggingface.co/v1",
+        "api_key": api_key,
+    }
+    if os.environ.get("INFERENCE_TOKEN"):
+        params["extra_headers"] = {"X-HF-Bill-To": "huggingface"}
+    if reasoning_effort:
+        hf_level = "low" if reasoning_effort == "minimal" else reasoning_effort
+        if hf_level in _HF_ALLOWED_EFFORTS:
+            params["extra_body"] = {"reasoning_effort": hf_level}
+    return params
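
A minimal call-site sketch, mirroring how the agent loop and context compaction
splat these kwargs into LiteLLM (the prompt and max_completion_tokens here are
placeholders, not values used by the agent):

import asyncio

from litellm import acompletion

from agent.core.llm_params import _resolve_llm_params


async def ping() -> str:
    # A bare HF id is routed through https://router.huggingface.co/v1; the token
    # falls back to INFERENCE_TOKEN / HF_TOKEN when no session token is passed.
    llm_params = _resolve_llm_params(
        "moonshotai/Kimi-K2.6",
        session_hf_token=None,
        reasoning_effort="medium",
    )
    response = await acompletion(
        messages=[{"role": "user", "content": "hi"}],  # placeholder prompt
        max_completion_tokens=64,
        **llm_params,
    )
    return response.choices[0].message.content


if __name__ == "__main__":
    print(asyncio.run(ping()))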
agent/core/session.py CHANGED
@@ -18,7 +18,6 @@ logger = logging.getLogger(__name__)
 # Local max-token lookup — avoids litellm.get_max_tokens() which can hang
 # on network calls for certain providers (known litellm issue).
 _MAX_TOKENS_MAP: dict[str, int] = {
-    # Anthropic
     "anthropic/claude-opus-4-6": 200_000,
     "anthropic/claude-opus-4-5-20251101": 200_000,
     "anthropic/claude-sonnet-4-5-20250929": 200_000,
@@ -26,20 +25,32 @@ _MAX_TOKENS_MAP: dict[str, int] = {
     "anthropic/claude-haiku-3-5-20241022": 200_000,
     "anthropic/claude-3-5-sonnet-20241022": 200_000,
     "anthropic/claude-3-opus-20240229": 200_000,
-    "huggingface/fireworks-ai/MiniMaxAI/MiniMax-M2.5": 200_000,
-    "huggingface/novita/minimax/minimax-m2.1": 196_608,
-    "huggingface/novita/moonshotai/kimi-k2.5": 262_144,
-    "huggingface/novita/zai-org/glm-5": 200_000,
 }
 _DEFAULT_MAX_TOKENS = 200_000
 
 
 def _get_max_tokens_safe(model_name: str) -> int:
-    """Return the max context window for a model without network calls."""
+    """Return the max context window for a model.
+
+    Anthropic/OpenAI ids hit the local table; HF router ids ask the catalog
+    (cached) for the max ``context_length`` across live providers. Falls back
+    to ``_DEFAULT_MAX_TOKENS`` if nothing is available.
+    """
     tokens = _MAX_TOKENS_MAP.get(model_name)
     if tokens:
         return tokens
-    # Fallback: try litellm but with a short timeout via threading
+
+    if not model_name.startswith(("anthropic/", "openai/")):
+        try:
+            from agent.core import hf_router_catalog as cat
+
+            bare = model_name.removeprefix("huggingface/").split(":", 1)[0]
+            info = cat.lookup(bare)
+            if info and info.max_context_length:
+                return info.max_context_length
+        except Exception as e:
+            logger.warning("HF catalog lookup failed for %s: %s", model_name, e)
+
     try:
         from litellm import get_max_tokens
 
@@ -49,10 +60,9 @@ def _get_max_tokens_safe(model_name: str) -> int:
         logger.warning(
             f"get_max_tokens returned {result} for {model_name}, using default"
         )
-        return _DEFAULT_MAX_TOKENS
     except Exception as e:
         logger.warning(f"get_max_tokens failed for {model_name}, using default: {e}")
-        return _DEFAULT_MAX_TOKENS
+    return _DEFAULT_MAX_TOKENS
 
 
 class OpType(Enum):
agent/main.py CHANGED
@@ -44,39 +44,41 @@ from agent.utils.terminal_display import (
 )
 
 litellm.drop_params = True
+# Suppress the "Give Feedback / Get Help" banner LiteLLM prints to stderr
+# on every error — users don't need it, and our friendly errors cover the case.
+litellm.suppress_debug_info = True
 
 # ── Suggested models shown by `/model` (not a gate) ──────────────────────
-# Any model id accepted by litellm is usable; for the HF router the form is
-# `huggingface/<inference_provider>/<org>/<model>` and users can pick any
-# model supported by any HF inference provider.
+# Users can paste any HF model id (e.g. "MiniMaxAI/MiniMax-M2.7") or use one
+# of the `anthropic/` / `openai/` prefixes for direct API access. For HF ids,
+# append ":fastest" / ":cheapest" / ":preferred" / ":<provider>" to override
+# the default routing policy (auto = fastest with failover).
 SUGGESTED_MODELS = [
     {"id": "anthropic/claude-opus-4-6", "label": "Claude Opus 4.6"},
-    {"id": "huggingface/fireworks-ai/MiniMaxAI/MiniMax-M2.5", "label": "MiniMax M2.5"},
-    {"id": "huggingface/novita/moonshotai/kimi-k2.5", "label": "Kimi K2.5"},
-    {"id": "huggingface/novita/zai-org/glm-5", "label": "GLM 5"},
+    {"id": "MiniMaxAI/MiniMax-M2.7", "label": "MiniMax M2.7"},
+    {"id": "moonshotai/Kimi-K2.6", "label": "Kimi K2.6"},
+    {"id": "zai-org/GLM-5.1", "label": "GLM 5.1"},
 ]
 
 
 def _is_valid_model_id(model_id: str) -> bool:
-    """Loose format check — lets users pick any inference-provider model.
+    """Loose format check — lets users pick any model id.
 
     Accepts:
-      • huggingface/<provider>/<org>/<model>  (HF router)
       • anthropic/<model>
       • openai/<model>
-    Actual availability is verified by the provider when the first call
-    is made; we don't want to maintain a hardcoded allowlist.
+      • <org>/<model>[:<tag>]              (HF router; tag = provider or policy)
+      • huggingface/<org>/<model>[:<tag>]  (same, accepts legacy prefix)
+
+    Actual availability is verified against the HF router catalog on switch,
+    or by the provider on first call.
     """
     if not model_id or "/" not in model_id:
         return False
-    if model_id.startswith("huggingface/"):
-        # needs provider + org + model → at least 3 slashes after the prefix
-        parts = model_id.split("/")
-        return len(parts) >= 4 and all(parts)
-    if model_id.startswith(("anthropic/", "openai/")):
-        parts = model_id.split("/", 1)
-        return len(parts) == 2 and bool(parts[1])
-    return False
+    # Strip :tag suffix before structural check
+    head = model_id.split(":", 1)[0]
+    parts = head.split("/")
+    return len(parts) >= 2 and all(parts)
 
 
 def _safe_get_args(arguments: dict) -> dict:
@@ -88,6 +90,80 @@ def _safe_get_args(arguments: dict) -> dict:
     return args if isinstance(args, dict) else {}
 
 
+_ROUTING_POLICIES = {"fastest", "cheapest", "preferred"}
+
+
+def _print_model_preflight(model_id: str, console) -> None:
+    """Validate a model switch against the HF router catalog and show the
+    user what they're about to use (providers, price, context, tool support).
+
+    Anthropic/OpenAI ids skip the catalog — those are direct API calls.
+    For unknown HF ids we print a red warning with fuzzy suggestions but
+    still allow the switch (the catalog might be lagging).
+    """
+    if model_id.startswith(("anthropic/", "openai/")):
+        console.print(f"[green]Model switched to {model_id}[/green]")
+        return
+
+    from agent.core import hf_router_catalog as cat
+
+    bare, _, tag = model_id.partition(":")
+    info = cat.lookup(bare)
+    if info is None:
+        console.print(
+            f"[bold red]Warning:[/bold red] '{bare}' isn't in the HF router "
+            "catalog. Switching anyway — first call may fail."
+        )
+        suggestions = cat.fuzzy_suggest(bare)
+        if suggestions:
+            console.print(f"[dim]Did you mean: {', '.join(suggestions)}[/dim]")
+        return
+
+    live = info.live_providers
+    if not live:
+        console.print(
+            f"[bold red]Warning:[/bold red] '{bare}' has no live providers "
+            "right now. First call will likely fail."
+        )
+        return
+
+    if tag and tag not in _ROUTING_POLICIES:
+        matched = [p for p in live if p.provider == tag]
+        if not matched:
+            names = ", ".join(p.provider for p in live)
+            console.print(
+                f"[bold red]Warning:[/bold red] provider '{tag}' doesn't serve "
+                f"'{bare}'. Live providers: {names}. Switching anyway."
+            )
+            return
+
+    if not info.any_supports_tools:
+        console.print(
+            f"[bold red]Warning:[/bold red] no provider for '{bare}' advertises "
+            "tool-call support. This agent relies on tool calls — expect errors."
+        )
+
+    console.print(f"[green]Model switched to {model_id}[/green]")
+    if tag in _ROUTING_POLICIES:
+        policy = tag
+    elif tag:
+        policy = f"pinned to {tag}"
+    else:
+        policy = "auto (fastest)"
+    console.print(f"  [dim]routing: {policy}[/dim]")
+    for p in live:
+        price = (
+            f"${p.input_price:g}/${p.output_price:g} per M tok"
+            if p.input_price is not None and p.output_price is not None
+            else "price n/a"
+        )
+        ctx = f"{p.context_length:,} ctx" if p.context_length else "ctx n/a"
+        tools = "tools" if p.supports_tools else "no tools"
+        console.print(
+            f"  [dim]{p.provider}: {price}, {ctx}, {tools}[/dim]"
+        )
+
+
 def _get_hf_token() -> str | None:
     """Get HF token from environment, huggingface_hub API, or cached token file."""
     token = os.environ.get("HF_TOKEN")
@@ -691,35 +767,37 @@ def _handle_slash_command(
         )
 
     if command == "/model":
+        console = get_console()
         if not arg:
             current = config.model_name if config else ""
-            print("Current model:")
-            print(f"  {current}")
-            print("\nSuggested models (any HF inference-provider model works):")
+            console.print("[bold]Current model:[/bold]")
+            console.print(f"  {current}")
+            console.print("\n[bold]Suggested:[/bold]")
            for m in SUGGESTED_MODELS:
-                marker = "  <-- current" if m["id"] == current else ""
-                print(f"  {m['id']} ({m['label']}){marker}")
-            print(
-                "\nPass any id, e.g. huggingface/<provider>/<org>/<model>.\n"
-                "Availability is verified on first use."
+                marker = " [dim]<-- current[/dim]" if m["id"] == current else ""
+                console.print(f"  {m['id']} [dim]({m['label']})[/dim]{marker}")
+            console.print(
+                "\n[dim]Paste any HF model id (e.g. 'MiniMaxAI/MiniMax-M2.7').\n"
+                "Add ':fastest', ':cheapest', ':preferred', or ':<provider>' to override routing.\n"
+                "Use 'anthropic/<model>' or 'openai/<model>' for direct API access.[/dim]"
             )
            return None
        if not _is_valid_model_id(arg):
-            print(f"Invalid model id format: {arg}")
-            print(
-                "Expected one of:\n"
-                "  • huggingface/<provider>/<org>/<model>\n"
+            console.print(f"[bold red]Invalid model id format:[/bold red] {arg}")
+            console.print(
+                "[dim]Expected:\n"
+                "  • <org>/<model>[:tag]   (HF router — paste from huggingface.co)\n"
                 "  • anthropic/<model>\n"
-                "  • openai/<model>"
+                "  • openai/<model>[/dim]"
            )
            return None
+        normalized = arg.removeprefix("huggingface/")
+        _print_model_preflight(normalized, console)
        session = session_holder[0] if session_holder else None
        if session:
-            session.update_model(arg)
-            print(f"Model switched to {arg}")
+            session.update_model(normalized)
        else:
-            config.model_name = arg
-            print(f"Model set to {arg} (session not started yet)")
+            config.model_name = normalized
        return None
 
    if command == "/yolo":
@@ -728,9 +806,31 @@ def _handle_slash_command(
         print(f"YOLO mode: {state}")
         return None
 
+    if command == "/effort":
+        console = get_console()
+        valid = {"minimal", "low", "medium", "high", "off"}
+        if not arg:
+            current = config.reasoning_effort or "off"
+            console.print(f"[bold]Reasoning effort:[/bold] {current}")
+            console.print(
+                "[dim]Set with '/effort minimal|low|medium|high|off'. "
+                "Applies to models that support it (GPT-5 / o-series, Claude "
+                "extended thinking, HF reasoning models); dropped otherwise.[/dim]"
+            )
+            return None
+        level = arg.lower()
+        if level not in valid:
+            console.print(f"[bold red]Invalid level:[/bold red] {arg}")
+            console.print(f"[dim]Expected one of: {', '.join(sorted(valid))}[/dim]")
+            return None
+        config.reasoning_effort = None if level == "off" else level
+        console.print(f"[green]Reasoning effort: {level}[/green]")
+        return None
+
     if command == "/status":
         session = session_holder[0] if session_holder else None
         print(f"Model: {config.model_name}")
+        print(f"Reasoning effort: {config.reasoning_effort or 'off'}")
         if session:
             print(f"Turns: {session.turn_count}")
             print(f"Context items: {len(session.context_manager.items)}")
@@ -764,6 +864,11 @@ async def main():
 
     print_banner(hf_user=hf_user)
 
+    # Pre-warm the HF router catalog in the background so /model switches
+    # don't block on a network fetch.
+    from agent.core import hf_router_catalog
+    asyncio.create_task(asyncio.to_thread(hf_router_catalog.prewarm))
+
     # Create queues for communication
     submission_queue = asyncio.Queue()
     event_queue = asyncio.Queue()
agent/tools/research_tool.py CHANGED
@@ -9,12 +9,12 @@ Inspired by claude-code's code-explorer agent pattern.
 
 import json
 import logging
-import os
 from typing import Any
 
 from litellm import Message, acompletion
 
 from agent.core.doom_loop import check_for_doom_loop
+from agent.core.llm_params import _resolve_llm_params
 from agent.core.session import Event
 
 logger = logging.getLogger(__name__)
@@ -213,32 +213,6 @@ RESEARCH_TOOL_SPEC = {
 }
 
 
-def _resolve_llm_params(
-    model_name: str, session_hf_token: str | None = None
-) -> dict:
-    """Build LiteLLM kwargs, reusing the HF router logic from agent_loop."""
-    if not model_name.startswith("huggingface/"):
-        return {"model": model_name}
-
-    parts = model_name.split("/", 2)  # ["huggingface", "<provider>", "<org>/<model>"]
-    if len(parts) < 3:
-        return {"model": model_name}
-
-    provider = parts[1]
-    model_id = parts[2]
-    api_key = (
-        os.environ.get("INFERENCE_TOKEN")
-        or session_hf_token
-        or os.environ.get("HF_TOKEN")
-        or ""
-    )
-    return {
-        "model": f"openai/{model_id}",
-        "api_base": f"https://router.huggingface.co/{provider}/v3/openai",
-        "api_key": api_key,
-    }
-
-
 def _get_research_model(main_model: str) -> str:
     """Pick a cheaper model for research based on the main model."""
     if "anthropic/" in main_model:
@@ -272,7 +246,11 @@ async def research_handler(
     # Use a cheaper/faster model for research
     main_model = session.config.model_name
     research_model = _get_research_model(main_model)
-    llm_params = _resolve_llm_params(research_model, getattr(session, "hf_token", None))
+    llm_params = _resolve_llm_params(
+        research_model,
+        getattr(session, "hf_token", None),
+        reasoning_effort=getattr(session.config, "reasoning_effort", None),
+    )
 
     # Get read-only tool specs from the session's tool router
     tool_specs = [
agent/utils/terminal_display.py CHANGED
@@ -318,6 +318,7 @@ HELP_TEXT = f"""\
 {_I} [cyan]/undo[/cyan]            Undo last turn
 {_I} [cyan]/compact[/cyan]         Compact context window
 {_I} [cyan]/model[/cyan] [id]      Show available models or switch
+{_I} [cyan]/effort[/cyan] [level]  Reasoning effort (minimal|low|medium|high|off)
 {_I} [cyan]/yolo[/cyan]            Toggle auto-approve mode
 {_I} [cyan]/status[/cyan]          Current model & turn count
 {_I} [cyan]/quit[/cyan]            Exit"""
backend/routes/agent.py CHANGED
@@ -30,7 +30,7 @@ from models import (
 )
 from session_manager import MAX_SESSIONS, SessionCapacityError, session_manager
 
-from agent.core.agent_loop import _resolve_hf_router_params
+from agent.core.llm_params import _resolve_llm_params
 
 logger = logging.getLogger(__name__)
 
@@ -44,19 +44,19 @@ AVAILABLE_MODELS = [
         "recommended": True,
     },
     {
-        "id": "huggingface/fireworks-ai/MiniMaxAI/MiniMax-M2.5",
-        "label": "MiniMax M2.5",
+        "id": "MiniMaxAI/MiniMax-M2.7",
+        "label": "MiniMax M2.7",
         "provider": "huggingface",
         "recommended": True,
     },
    {
-        "id": "huggingface/novita/moonshotai/kimi-k2.5",
-        "label": "Kimi K2.5",
+        "id": "moonshotai/Kimi-K2.6",
+        "label": "Kimi K2.6",
         "provider": "huggingface",
    },
    {
-        "id": "huggingface/novita/zai-org/glm-5",
-        "label": "GLM 5",
+        "id": "zai-org/GLM-5.1",
+        "label": "GLM 5.1",
         "provider": "huggingface",
    },
 ]
@@ -93,7 +93,7 @@ async def llm_health_check() -> LLMHealthResponse:
     """
     model = session_manager.config.model_name
     try:
-        llm_params = _resolve_hf_router_params(model)
+        llm_params = _resolve_llm_params(model, reasoning_effort="high")
         await acompletion(
             messages=[{"role": "user", "content": "hi"}],
             max_tokens=1,
@@ -163,7 +163,7 @@ async def generate_title(
 ) -> dict:
     """Generate a short title for a chat session based on the first user message."""
     model = session_manager.config.model_name
-    llm_params = _resolve_hf_router_params(model)
+    llm_params = _resolve_llm_params(model, reasoning_effort="high")
     try:
         response = await acompletion(
             messages=[
frontend/src/components/Chat/ChatInput.tsx CHANGED
@@ -30,26 +30,26 @@ const MODEL_OPTIONS: ModelOption[] = [
     recommended: true,
   },
   {
-    id: 'minimax-m2.5',
-    name: 'MiniMax M2.5',
-    description: 'Via Fireworks',
-    modelPath: 'huggingface/fireworks-ai/MiniMaxAI/MiniMax-M2.5',
-    avatarUrl: getHfAvatarUrl('MiniMaxAI/MiniMax-M2.5'),
+    id: 'minimax-m2.7',
+    name: 'MiniMax M2.7',
+    description: 'HF auto-routed',
+    modelPath: 'MiniMaxAI/MiniMax-M2.7',
+    avatarUrl: getHfAvatarUrl('MiniMaxAI/MiniMax-M2.7'),
     recommended: true,
   },
   {
-    id: 'kimi-k2.5',
-    name: 'Kimi K2.5',
-    description: 'Via Novita',
-    modelPath: 'huggingface/novita/moonshotai/kimi-k2.5',
-    avatarUrl: getHfAvatarUrl('moonshotai/Kimi-K2.5'),
+    id: 'kimi-k2.6',
+    name: 'Kimi K2.6',
+    description: 'HF auto-routed',
+    modelPath: 'moonshotai/Kimi-K2.6',
+    avatarUrl: getHfAvatarUrl('moonshotai/Kimi-K2.6'),
   },
   {
-    id: 'glm-5',
-    name: 'GLM 5',
-    description: 'Via Novita',
-    modelPath: 'huggingface/novita/zai-org/glm-5',
-    avatarUrl: getHfAvatarUrl('zai-org/GLM-5'),
+    id: 'glm-5.1',
+    name: 'GLM 5.1',
+    description: 'HF auto-routed',
+    modelPath: 'zai-org/GLM-5.1',
+    avatarUrl: getHfAvatarUrl('zai-org/GLM-5.1'),
   },
 ];
 