akseljoonas (HF Staff) committed
Commit 0a9e96d · Parent: 1c0de34

Route HF inference through /v1 auto-router + add reasoning_effort knob


Users paste bare HF model ids (MiniMaxAI/MiniMax-M2.7, moonshotai/Kimi-K2.6)
with an optional :fastest|cheapest|preferred|<provider> suffix; the router
picks a provider and handles failover. /model does a live preflight against
/v1/models and prints providers, pricing, context length, and tool support;
unknown ids get a warning plus fuzzy suggestions but are still allowed.
Friendly messages replace
LiteLLM's raw traceback for model/provider mismatches, and the noisy
'Give Feedback' banner is suppressed.

Adds a reasoning_effort config + /effort command (default high). OpenAI and
Anthropic get the top-level param natively; HF router gets it via extra_body
with minimal->low normalization for models like MiniMax M2 that require
reasoning. Frontend + backend model selectors updated to the bare-id format.
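
For a concrete feel, here is a rough sketch of the LiteLLM kwargs the new
resolver produces (derived from agent/core/llm_params.py below; the "hf_xxx"
token is a placeholder and INFERENCE_TOKEN is assumed to be unset):

from agent.core.llm_params import _resolve_llm_params

# Direct-API ids pass through; reasoning_effort rides along as a top-level param.
_resolve_llm_params("anthropic/claude-opus-4-6", reasoning_effort="high")
# -> {"model": "anthropic/claude-opus-4-6", "reasoning_effort": "high"}

# Bare HF ids become an OpenAI-compatible call against the /v1 auto-router;
# "minimal" is normalized to "low" and forwarded via extra_body.
_resolve_llm_params("MiniMaxAI/MiniMax-M2.7:cheapest", "hf_xxx", reasoning_effort="minimal")
# -> {"model": "openai/MiniMaxAI/MiniMax-M2.7:cheapest",
#     "api_base": "https://router.huggingface.co/v1",
#     "api_key": "hf_xxx",
#     "extra_body": {"reasoning_effort": "low"}}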

agent/config.py CHANGED
@@ -33,6 +33,15 @@ class Config(BaseModel):
     confirm_cpu_jobs: bool = True
     auto_file_upload: bool = False
 
+    # Reasoning effort for models that support it (GPT-5 / o-series, Claude
+    # extended thinking, HF reasoning models like MiniMax M2 / Kimi K2).
+    # Defaults to "high" — we'd rather spend tokens thinking than ship a
+    # wrong ML recipe. Users can dial down with `/effort low|medium|off`.
+    # "minimal" is an OpenAI-only level and is normalized to "low" for HF
+    # router models (MiniMax requires ≥low). Ignored for non-reasoning models.
+    # Valid values: None | "minimal" | "low" | "medium" | "high"
+    reasoning_effort: str | None = "high"
+
 
 def substitute_env_vars(obj: Any) -> Any:
     """
agent/context_manager/manager.py CHANGED
@@ -306,19 +306,14 @@ class ContextManager:
             )
         )
 
-        hf_key = (
-            os.environ.get("INFERENCE_TOKEN")
-            or hf_token
-            or os.environ.get("HF_TOKEN")
-        )
+        from agent.core.llm_params import _resolve_llm_params
+
+        llm_params = _resolve_llm_params(model_name, hf_token, reasoning_effort="high")
         response = await acompletion(
-            model=model_name,
             messages=messages_to_summarize,
             max_completion_tokens=self.compact_size,
             tools=tool_specs,
-            api_key=hf_key
-            if hf_key and model_name.startswith("huggingface/")
-            else None,
+            **llm_params,
         )
         summarized_message = Message(
             role="assistant", content=response.choices[0].message.content
agent/core/agent_loop.py CHANGED
@@ -13,6 +13,7 @@ from litellm.exceptions import ContextWindowExceededError
 
 from agent.config import Config
 from agent.core.doom_loop import check_for_doom_loop
+from agent.core.llm_params import _resolve_llm_params
 from agent.core.session import Event, OpType, Session
 from agent.core.tools import ToolRouter
 from agent.tools.jobs_tool import CPU_FLAVORS
@@ -22,51 +23,6 @@ logger = logging.getLogger(__name__)
 ToolCall = ChatCompletionMessageToolCall
 
 
-def _resolve_hf_router_params(
-    model_name: str, session_hf_token: str | None = None
-) -> dict:
-    """
-    Build LiteLLM kwargs for HuggingFace Router models.
-
-    api-inference.huggingface.co is deprecated; the new router lives at
-    router.huggingface.co/<provider>/v3/openai. LiteLLM's built-in
-    ``huggingface/`` provider still targets the old endpoint, so we
-    rewrite model names to ``openai/`` and supply the correct api_base.
-
-    Input format: huggingface/<router_provider>/<org>/<model>
-    Example: huggingface/novita/moonshotai/kimi-k2.5
-
-    Token resolution (first non-empty wins):
-    1. INFERENCE_TOKEN env — shared key on the hosted Space so inference
-       is free for users and billed to the Space owner.
-    2. session.hf_token — the user's own token (CLI or self-hosted),
-       resolved from env / huggingface-cli login / cached token file.
-    3. HF_TOKEN env — belt-and-suspenders fallback for CLI users.
-    """
-    if not model_name.startswith("huggingface/"):
-        return {"model": model_name}
-
-    parts = model_name.split(
-        "/", 2
-    )  # ['huggingface', 'novita', 'moonshotai/kimi-k2.5']
-    if len(parts) < 3:
-        return {"model": model_name}
-
-    router_provider = parts[1]
-    actual_model = parts[2]
-    api_key = (
-        os.environ.get("INFERENCE_TOKEN")
-        or session_hf_token
-        or os.environ.get("HF_TOKEN")
-    )
-
-    return {
-        "model": f"openai/{actual_model}",
-        "api_base": f"https://router.huggingface.co/{router_provider}/v3/openai",
-        "api_key": api_key,
-    }
-
-
 def _validate_tool_args(tool_args: dict) -> tuple[bool, str | None]:
     """
     Validate tool arguments structure.
@@ -201,6 +157,24 @@ def _friendly_error_message(error: Exception) -> str | None:
             "at your model provider's dashboard."
         )
 
+    if "not supported by provider" in err_str or "no provider supports" in err_str:
+        return (
+            "The model isn't served by the provider you pinned.\n\n"
+            "Drop the ':<provider>' suffix to let the HF router auto-pick a "
+            "provider, or use '/model' (no arg) to see which providers host "
+            "which models."
+        )
+
+    if "model_not_found" in err_str or (
+        "model" in err_str
+        and ("not found" in err_str or "does not exist" in err_str)
+    ):
+        return (
+            "Model not found. Use '/model' to list suggestions, or paste an "
+            "HF model id like 'MiniMaxAI/MiniMax-M2.7'. Availability is shown "
+            "when you switch."
+        )
+
     return None
 
 
@@ -518,8 +492,10 @@ class Handlers:
             tools = session.tool_router.get_tool_specs_for_llm()
             try:
                 # ── Call the LLM (streaming or non-streaming) ──
-                llm_params = _resolve_hf_router_params(
-                    session.config.model_name, session.hf_token
+                llm_params = _resolve_llm_params(
+                    session.config.model_name,
+                    session.hf_token,
+                    reasoning_effort=session.config.reasoning_effort,
                 )
                 if session.stream:
                     llm_result = await _call_llm_streaming(session, messages, tools, llm_params)
agent/core/hf_router_catalog.py ADDED
@@ -0,0 +1,129 @@
+"""Fetch and cache the HF Inference Router model catalog.
+
+The router exposes an OpenAI-compatible listing at
+``https://router.huggingface.co/v1/models`` with per-provider availability,
+pricing, context length, and tool-use support. We use it to:
+
+• Validate ``/model`` switches with live data instead of a hard-coded allowlist.
+• Show the user which providers serve a model, at what price, and whether they
+  support tool calls.
+• Derive a reasonable context-window limit for any routed model.
+
+The listing is cached in-memory for a few minutes so repeated lookups during a
+session are free. On fetch failure we return stale data if we have it, or an
+empty catalog otherwise.
+"""
+
+import logging
+import time
+from dataclasses import dataclass
+from difflib import get_close_matches
+from typing import Optional
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+_CATALOG_URL = "https://router.huggingface.co/v1/models"
+_CACHE_TTL_SECONDS = 300
+_HTTP_TIMEOUT_SECONDS = 5.0
+
+_cache: Optional[dict] = None
+_cache_time: float = 0.0
+
+
+@dataclass
+class ProviderInfo:
+    provider: str
+    status: str
+    context_length: Optional[int]
+    input_price: Optional[float]
+    output_price: Optional[float]
+    supports_tools: bool
+    supports_structured_output: bool
+
+
+@dataclass
+class ModelInfo:
+    id: str
+    providers: list[ProviderInfo]
+
+    @property
+    def live_providers(self) -> list[ProviderInfo]:
+        return [p for p in self.providers if p.status == "live"]
+
+    @property
+    def max_context_length(self) -> Optional[int]:
+        lengths = [p.context_length for p in self.live_providers if p.context_length]
+        return max(lengths) if lengths else None
+
+    @property
+    def any_supports_tools(self) -> bool:
+        return any(p.supports_tools for p in self.live_providers)
+
+
+def _fetch_catalog(force: bool = False) -> dict:
+    global _cache, _cache_time
+    now = time.time()
+    if not force and _cache is not None and now - _cache_time < _CACHE_TTL_SECONDS:
+        return _cache
+    try:
+        resp = httpx.get(_CATALOG_URL, timeout=_HTTP_TIMEOUT_SECONDS)
+        resp.raise_for_status()
+        _cache = resp.json()
+        _cache_time = now
+    except Exception as e:
+        logger.warning("Failed to fetch HF router catalog: %s", e)
+        if _cache is None:
+            _cache = {"data": []}
+            _cache_time = now
+    return _cache
+
+
+def _parse_entry(entry: dict) -> ModelInfo:
+    providers = []
+    for p in entry.get("providers", []) or []:
+        pricing = p.get("pricing") or {}
+        providers.append(
+            ProviderInfo(
+                provider=p.get("provider", ""),
+                status=p.get("status", ""),
+                context_length=p.get("context_length"),
+                input_price=pricing.get("input"),
+                output_price=pricing.get("output"),
+                supports_tools=bool(p.get("supports_tools", False)),
+                supports_structured_output=bool(p.get("supports_structured_output", False)),
+            )
+        )
+    return ModelInfo(id=entry.get("id", ""), providers=providers)
+
+
+def lookup(model_id: str) -> Optional[ModelInfo]:
+    """Find a model in the router catalog.
+
+    Accepts ``<org>/<model>`` or ``<org>/<model>:<tag>`` — the tag is stripped
+    for lookup. Returns ``None`` if the model isn't listed.
+    """
+    bare = model_id.split(":", 1)[0]
+    catalog = _fetch_catalog()
+    for entry in catalog.get("data", []):
+        if entry.get("id") == bare:
+            return _parse_entry(entry)
+    return None
+
+
+def fuzzy_suggest(model_id: str, limit: int = 3) -> list[str]:
+    """Return the closest model ids from the catalog."""
+    bare = model_id.split(":", 1)[0]
+    catalog = _fetch_catalog()
+    ids = [e.get("id", "") for e in catalog.get("data", []) if e.get("id")]
+    return get_close_matches(bare, ids, n=limit, cutoff=0.4)
+
+
+def prewarm() -> None:
+    """Fetch the catalog so subsequent lookups are instant. Safe to call from
+    a background task — swallows failures."""
+    try:
+        _fetch_catalog(force=False)
+    except Exception:
+        pass
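
For reference, lookup() and _parse_entry() above expect catalog entries shaped
roughly like the sketch below. The field names are exactly the ones _parse_entry
reads; the provider name, prices, and context length are made-up illustrative
values, not real quotes from the router.

from agent.core.hf_router_catalog import _parse_entry

entry = {
    "id": "MiniMaxAI/MiniMax-M2.7",
    "providers": [
        {
            "provider": "novita",                       # illustrative provider
            "status": "live",
            "context_length": 196_608,                  # illustrative value
            "pricing": {"input": 0.3, "output": 1.2},   # illustrative $/M tok
            "supports_tools": True,
            "supports_structured_output": True,
        },
    ],
}

info = _parse_entry(entry)
assert info.live_providers[0].provider == "novita"
assert info.any_supports_tools
assert info.max_context_length == 196_608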
agent/core/llm_params.py ADDED
@@ -0,0 +1,76 @@
+"""LiteLLM kwargs resolution for the model ids this agent accepts.
+
+Kept separate from ``agent_loop`` so tools (research, context compaction, etc.)
+can import it without pulling in the whole agent loop / tool router and
+creating circular imports.
+"""
+
+import os
+
+
+# HF router reasoning models only accept "low" | "medium" | "high" (e.g.
+# MiniMax M2 actually *requires* reasoning to be enabled). OpenAI's GPT-5
+# also accepts "minimal" for near-zero thinking. We map "minimal" to "low"
+# for HF so the user doesn't get a 400.
+_HF_ALLOWED_EFFORTS = {"low", "medium", "high"}
+
+
+def _resolve_llm_params(
+    model_name: str,
+    session_hf_token: str | None = None,
+    reasoning_effort: str | None = None,
+) -> dict:
+    """
+    Build LiteLLM kwargs for a given model id.
+
+    • ``anthropic/<model>`` / ``openai/<model>`` — passed straight through; the
+      user's own ``ANTHROPIC_API_KEY`` / ``OPENAI_API_KEY`` env vars are picked
+      up by LiteLLM. ``reasoning_effort`` is forwarded as a top-level param
+      (GPT-5 / o-series accept "minimal" | "low" | "medium" | "high"; Claude
+      extended-thinking models accept "low" | "medium" | "high" and LiteLLM
+      translates to the thinking config).
+
+    • Anything else is treated as a HuggingFace router id. We hit the
+      auto-routing OpenAI-compatible endpoint at
+      ``https://router.huggingface.co/v1``, which bypasses LiteLLM's stale
+      per-provider HF adapter entirely. The id can be bare or carry an HF
+      routing suffix:
+
+        MiniMaxAI/MiniMax-M2.7            # auto = fastest + failover
+        MiniMaxAI/MiniMax-M2.7:cheapest
+        moonshotai/Kimi-K2.6:novita       # pin a specific provider
+
+    A leading ``huggingface/`` is stripped for convenience. ``reasoning_effort``
+    is forwarded via ``extra_body`` (LiteLLM's OpenAI adapter refuses it as a
+    top-level kwarg for non-OpenAI models). "minimal" is normalized to "low".
+
+    Token precedence (first non-empty wins):
+    1. INFERENCE_TOKEN env — shared key on the hosted Space (inference is
+       free for users, billed to the Space owner via ``X-HF-Bill-To``).
+    2. session.hf_token — the user's own token (CLI / OAuth / cache file).
+    3. HF_TOKEN env — belt-and-suspenders fallback for CLI users.
+    """
+    if model_name.startswith(("anthropic/", "openai/")):
+        params: dict = {"model": model_name}
+        if reasoning_effort:
+            params["reasoning_effort"] = reasoning_effort
+        return params
+
+    hf_model = model_name.removeprefix("huggingface/")
+    api_key = (
+        os.environ.get("INFERENCE_TOKEN")
+        or session_hf_token
+        or os.environ.get("HF_TOKEN")
+    )
+    params = {
+        "model": f"openai/{hf_model}",
+        "api_base": "https://router.huggingface.co/v1",
+        "api_key": api_key,
+    }
+    if os.environ.get("INFERENCE_TOKEN"):
+        params["extra_headers"] = {"X-HF-Bill-To": "huggingface"}
+    if reasoning_effort:
+        hf_level = "low" if reasoning_effort == "minimal" else reasoning_effort
+        if hf_level in _HF_ALLOWED_EFFORTS:
+            params["extra_body"] = {"reasoning_effort": hf_level}
+    return params
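
A minimal call-site sketch, mirroring how the agent loop and context compaction
splat these kwargs into LiteLLM (the prompt and max_completion_tokens here are
placeholders, not values used by the agent):

import asyncio

from litellm import acompletion

from agent.core.llm_params import _resolve_llm_params


async def ping() -> str:
    # A bare HF id is routed through https://router.huggingface.co/v1; the token
    # falls back to INFERENCE_TOKEN / HF_TOKEN when no session token is passed.
    llm_params = _resolve_llm_params(
        "moonshotai/Kimi-K2.6",
        session_hf_token=None,
        reasoning_effort="medium",
    )
    response = await acompletion(
        messages=[{"role": "user", "content": "hi"}],  # placeholder prompt
        max_completion_tokens=64,
        **llm_params,
    )
    return response.choices[0].message.content


if __name__ == "__main__":
    print(asyncio.run(ping()))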
agent/core/session.py CHANGED
@@ -18,7 +18,6 @@ logger = logging.getLogger(__name__)
 # Local max-token lookup — avoids litellm.get_max_tokens() which can hang
 # on network calls for certain providers (known litellm issue).
 _MAX_TOKENS_MAP: dict[str, int] = {
-    # Anthropic
     "anthropic/claude-opus-4-6": 200_000,
     "anthropic/claude-opus-4-5-20251101": 200_000,
     "anthropic/claude-sonnet-4-5-20250929": 200_000,
@@ -26,20 +25,32 @@ _MAX_TOKENS_MAP: dict[str, int] = {
     "anthropic/claude-haiku-3-5-20241022": 200_000,
     "anthropic/claude-3-5-sonnet-20241022": 200_000,
     "anthropic/claude-3-opus-20240229": 200_000,
-    "huggingface/fireworks-ai/MiniMaxAI/MiniMax-M2.5": 200_000,
-    "huggingface/novita/minimax/minimax-m2.1": 196_608,
-    "huggingface/novita/moonshotai/kimi-k2.5": 262_144,
-    "huggingface/novita/zai-org/glm-5": 200_000,
 }
 _DEFAULT_MAX_TOKENS = 200_000
 
 
 def _get_max_tokens_safe(model_name: str) -> int:
-    """Return the max context window for a model without network calls."""
+    """Return the max context window for a model.
+
+    Anthropic/OpenAI ids hit the local table; HF router ids ask the catalog
+    (cached) for the max ``context_length`` across live providers. Falls back
+    to ``_DEFAULT_MAX_TOKENS`` if nothing is available.
+    """
     tokens = _MAX_TOKENS_MAP.get(model_name)
     if tokens:
         return tokens
-    # Fallback: try litellm but with a short timeout via threading
+
+    if not model_name.startswith(("anthropic/", "openai/")):
+        try:
+            from agent.core import hf_router_catalog as cat
+
+            bare = model_name.removeprefix("huggingface/").split(":", 1)[0]
+            info = cat.lookup(bare)
+            if info and info.max_context_length:
+                return info.max_context_length
+        except Exception as e:
+            logger.warning("HF catalog lookup failed for %s: %s", model_name, e)
+
     try:
         from litellm import get_max_tokens
 
@@ -49,10 +60,9 @@ def _get_max_tokens_safe(model_name: str) -> int:
         logger.warning(
             f"get_max_tokens returned {result} for {model_name}, using default"
         )
-        return _DEFAULT_MAX_TOKENS
     except Exception as e:
         logger.warning(f"get_max_tokens failed for {model_name}, using default: {e}")
-        return _DEFAULT_MAX_TOKENS
+    return _DEFAULT_MAX_TOKENS
 
 
 class OpType(Enum):
agent/main.py CHANGED
@@ -44,39 +44,41 @@ from agent.utils.terminal_display import (
 )
 
 litellm.drop_params = True
+# Suppress the "Give Feedback / Get Help" banner LiteLLM prints to stderr
+# on every error — users don't need it, and our friendly errors cover the case.
+litellm.suppress_debug_info = True
 
 # ── Suggested models shown by `/model` (not a gate) ──────────────────────
-# Any model id accepted by litellm is usable; for the HF router the form is
-# `huggingface/<inference_provider>/<org>/<model>` and users can pick any
-# model supported by any HF inference provider.
+# Users can paste any HF model id (e.g. "MiniMaxAI/MiniMax-M2.7") or use one
+# of the `anthropic/` / `openai/` prefixes for direct API access. For HF ids,
+# append ":fastest" / ":cheapest" / ":preferred" / ":<provider>" to override
+# the default routing policy (auto = fastest with failover).
 SUGGESTED_MODELS = [
     {"id": "anthropic/claude-opus-4-6", "label": "Claude Opus 4.6"},
-    {"id": "huggingface/fireworks-ai/MiniMaxAI/MiniMax-M2.5", "label": "MiniMax M2.5"},
-    {"id": "huggingface/novita/moonshotai/kimi-k2.5", "label": "Kimi K2.5"},
-    {"id": "huggingface/novita/zai-org/glm-5", "label": "GLM 5"},
+    {"id": "MiniMaxAI/MiniMax-M2.7", "label": "MiniMax M2.7"},
+    {"id": "moonshotai/Kimi-K2.6", "label": "Kimi K2.6"},
+    {"id": "zai-org/GLM-5.1", "label": "GLM 5.1"},
 ]
 
 
 def _is_valid_model_id(model_id: str) -> bool:
-    """Loose format check — lets users pick any inference-provider model.
+    """Loose format check — lets users pick any model id.
 
     Accepts:
-      • huggingface/<provider>/<org>/<model>  (HF router)
       • anthropic/<model>
       • openai/<model>
-    Actual availability is verified by the provider when the first call
-    is made; we don't want to maintain a hardcoded allowlist.
+      • <org>/<model>[:<tag>]              (HF router; tag = provider or policy)
+      • huggingface/<org>/<model>[:<tag>]  (same, accepts legacy prefix)
+
+    Actual availability is verified against the HF router catalog on switch,
+    or by the provider on first call.
     """
     if not model_id or "/" not in model_id:
         return False
-    if model_id.startswith("huggingface/"):
-        # needs provider + org + model → at least 3 slashes after the prefix
-        parts = model_id.split("/")
-        return len(parts) >= 4 and all(parts)
-    if model_id.startswith(("anthropic/", "openai/")):
-        parts = model_id.split("/", 1)
-        return len(parts) == 2 and bool(parts[1])
-    return False
+    # Strip :tag suffix before structural check
+    head = model_id.split(":", 1)[0]
+    parts = head.split("/")
+    return len(parts) >= 2 and all(parts)
 
 
 def _safe_get_args(arguments: dict) -> dict:
@@ -88,6 +90,80 @@ def _safe_get_args(arguments: dict) -> dict:
     return args if isinstance(args, dict) else {}
 
 
+_ROUTING_POLICIES = {"fastest", "cheapest", "preferred"}
+
+
+def _print_model_preflight(model_id: str, console) -> None:
+    """Validate a model switch against the HF router catalog and show the
+    user what they're about to use (providers, price, context, tool support).
+
+    Anthropic/OpenAI ids skip the catalog — those are direct API calls.
+    For unknown HF ids we print a red warning with fuzzy suggestions but
+    still allow the switch (the catalog might be lagging).
+    """
+    if model_id.startswith(("anthropic/", "openai/")):
+        console.print(f"[green]Model switched to {model_id}[/green]")
+        return
+
+    from agent.core import hf_router_catalog as cat
+
+    bare, _, tag = model_id.partition(":")
+    info = cat.lookup(bare)
+    if info is None:
+        console.print(
+            f"[bold red]Warning:[/bold red] '{bare}' isn't in the HF router "
+            "catalog. Switching anyway — first call may fail."
+        )
+        suggestions = cat.fuzzy_suggest(bare)
+        if suggestions:
+            console.print(f"[dim]Did you mean: {', '.join(suggestions)}[/dim]")
+        return
+
+    live = info.live_providers
+    if not live:
+        console.print(
+            f"[bold red]Warning:[/bold red] '{bare}' has no live providers "
+            "right now. First call will likely fail."
+        )
+        return
+
+    if tag and tag not in _ROUTING_POLICIES:
+        matched = [p for p in live if p.provider == tag]
+        if not matched:
+            names = ", ".join(p.provider for p in live)
+            console.print(
+                f"[bold red]Warning:[/bold red] provider '{tag}' doesn't serve "
+                f"'{bare}'. Live providers: {names}. Switching anyway."
+            )
+            return
+
+    if not info.any_supports_tools:
+        console.print(
+            f"[bold red]Warning:[/bold red] no provider for '{bare}' advertises "
+            "tool-call support. This agent relies on tool calls — expect errors."
+        )
+
+    console.print(f"[green]Model switched to {model_id}[/green]")
+    if tag in _ROUTING_POLICIES:
+        policy = tag
+    elif tag:
+        policy = f"pinned to {tag}"
+    else:
+        policy = "auto (fastest)"
+    console.print(f"  [dim]routing: {policy}[/dim]")
+    for p in live:
+        price = (
+            f"${p.input_price:g}/${p.output_price:g} per M tok"
+            if p.input_price is not None and p.output_price is not None
+            else "price n/a"
+        )
+        ctx = f"{p.context_length:,} ctx" if p.context_length else "ctx n/a"
+        tools = "tools" if p.supports_tools else "no tools"
+        console.print(
+            f"  [dim]{p.provider}: {price}, {ctx}, {tools}[/dim]"
+        )
+
+
 def _get_hf_token() -> str | None:
     """Get HF token from environment, huggingface_hub API, or cached token file."""
     token = os.environ.get("HF_TOKEN")
@@ -691,35 +767,37 @@ def _handle_slash_command(
         )
 
     if command == "/model":
+        console = get_console()
         if not arg:
             current = config.model_name if config else ""
-            print("Current model:")
-            print(f"  {current}")
-            print("\nSuggested models (any HF inference-provider model works):")
+            console.print("[bold]Current model:[/bold]")
+            console.print(f"  {current}")
+            console.print("\n[bold]Suggested:[/bold]")
            for m in SUGGESTED_MODELS:
-                marker = "  <-- current" if m["id"] == current else ""
-                print(f"  {m['id']} ({m['label']}){marker}")
-            print(
-                "\nPass any id, e.g. huggingface/<provider>/<org>/<model>.\n"
-                "Availability is verified on first use."
+                marker = " [dim]<-- current[/dim]" if m["id"] == current else ""
+                console.print(f"  {m['id']} [dim]({m['label']})[/dim]{marker}")
+            console.print(
+                "\n[dim]Paste any HF model id (e.g. 'MiniMaxAI/MiniMax-M2.7').\n"
+                "Add ':fastest', ':cheapest', ':preferred', or ':<provider>' to override routing.\n"
+                "Use 'anthropic/<model>' or 'openai/<model>' for direct API access.[/dim]"
             )
            return None
        if not _is_valid_model_id(arg):
-            print(f"Invalid model id format: {arg}")
-            print(
-                "Expected one of:\n"
-                "  • huggingface/<provider>/<org>/<model>\n"
+            console.print(f"[bold red]Invalid model id format:[/bold red] {arg}")
+            console.print(
+                "[dim]Expected:\n"
+                "  • <org>/<model>[:tag]   (HF router — paste from huggingface.co)\n"
                 "  • anthropic/<model>\n"
-                "  • openai/<model>"
+                "  • openai/<model>[/dim]"
            )
            return None
+        normalized = arg.removeprefix("huggingface/")
+        _print_model_preflight(normalized, console)
        session = session_holder[0] if session_holder else None
        if session:
-            session.update_model(arg)
-            print(f"Model switched to {arg}")
+            session.update_model(normalized)
        else:
-            config.model_name = arg
-            print(f"Model set to {arg} (session not started yet)")
+            config.model_name = normalized
        return None
 
    if command == "/yolo":
@@ -728,9 +806,31 @@ def _handle_slash_command(
         print(f"YOLO mode: {state}")
         return None
 
+    if command == "/effort":
+        console = get_console()
+        valid = {"minimal", "low", "medium", "high", "off"}
+        if not arg:
+            current = config.reasoning_effort or "off"
+            console.print(f"[bold]Reasoning effort:[/bold] {current}")
+            console.print(
+                "[dim]Set with '/effort minimal|low|medium|high|off'. "
+                "Applies to models that support it (GPT-5 / o-series, Claude "
+                "extended thinking, HF reasoning models); dropped otherwise.[/dim]"
+            )
+            return None
+        level = arg.lower()
+        if level not in valid:
+            console.print(f"[bold red]Invalid level:[/bold red] {arg}")
+            console.print(f"[dim]Expected one of: {', '.join(sorted(valid))}[/dim]")
+            return None
+        config.reasoning_effort = None if level == "off" else level
+        console.print(f"[green]Reasoning effort: {level}[/green]")
+        return None
+
     if command == "/status":
         session = session_holder[0] if session_holder else None
         print(f"Model: {config.model_name}")
+        print(f"Reasoning effort: {config.reasoning_effort or 'off'}")
         if session:
             print(f"Turns: {session.turn_count}")
             print(f"Context items: {len(session.context_manager.items)}")
@@ -764,6 +864,11 @@ async def main():
 
     print_banner(hf_user=hf_user)
 
+    # Pre-warm the HF router catalog in the background so /model switches
+    # don't block on a network fetch.
+    from agent.core import hf_router_catalog
+    asyncio.create_task(asyncio.to_thread(hf_router_catalog.prewarm))
+
     # Create queues for communication
     submission_queue = asyncio.Queue()
     event_queue = asyncio.Queue()
agent/tools/research_tool.py CHANGED
@@ -9,12 +9,12 @@ Inspired by claude-code's code-explorer agent pattern.
 
 import json
 import logging
-import os
 from typing import Any
 
 from litellm import Message, acompletion
 
 from agent.core.doom_loop import check_for_doom_loop
+from agent.core.llm_params import _resolve_llm_params
 from agent.core.session import Event
 
 logger = logging.getLogger(__name__)
@@ -213,32 +213,6 @@ RESEARCH_TOOL_SPEC = {
 }
 
 
-def _resolve_llm_params(
-    model_name: str, session_hf_token: str | None = None
-) -> dict:
-    """Build LiteLLM kwargs, reusing the HF router logic from agent_loop."""
-    if not model_name.startswith("huggingface/"):
-        return {"model": model_name}
-
-    parts = model_name.split("/", 2)  # ["huggingface", "<provider>", "<org>/<model>"]
-    if len(parts) < 3:
-        return {"model": model_name}
-
-    provider = parts[1]
-    model_id = parts[2]
-    api_key = (
-        os.environ.get("INFERENCE_TOKEN")
-        or session_hf_token
-        or os.environ.get("HF_TOKEN")
-        or ""
-    )
-    return {
-        "model": f"openai/{model_id}",
-        "api_base": f"https://router.huggingface.co/{provider}/v3/openai",
-        "api_key": api_key,
-    }
-
-
 def _get_research_model(main_model: str) -> str:
     """Pick a cheaper model for research based on the main model."""
     if "anthropic/" in main_model:
@@ -272,7 +246,11 @@ async def research_handler(
     # Use a cheaper/faster model for research
     main_model = session.config.model_name
     research_model = _get_research_model(main_model)
-    llm_params = _resolve_llm_params(research_model, getattr(session, "hf_token", None))
+    llm_params = _resolve_llm_params(
+        research_model,
+        getattr(session, "hf_token", None),
+        reasoning_effort=getattr(session.config, "reasoning_effort", None),
+    )
 
     # Get read-only tool specs from the session's tool router
     tool_specs = [
agent/utils/terminal_display.py CHANGED
@@ -318,6 +318,7 @@ HELP_TEXT = f"""\
 {_I} [cyan]/undo[/cyan]            Undo last turn
 {_I} [cyan]/compact[/cyan]         Compact context window
 {_I} [cyan]/model[/cyan] [id]      Show available models or switch
+{_I} [cyan]/effort[/cyan] [level]  Reasoning effort (minimal|low|medium|high|off)
 {_I} [cyan]/yolo[/cyan]            Toggle auto-approve mode
 {_I} [cyan]/status[/cyan]          Current model & turn count
 {_I} [cyan]/quit[/cyan]            Exit"""
backend/routes/agent.py CHANGED
@@ -30,7 +30,7 @@ from models import (
 )
 from session_manager import MAX_SESSIONS, SessionCapacityError, session_manager
 
-from agent.core.agent_loop import _resolve_hf_router_params
+from agent.core.llm_params import _resolve_llm_params
 
 logger = logging.getLogger(__name__)
 
@@ -44,19 +44,19 @@ AVAILABLE_MODELS = [
         "recommended": True,
     },
     {
-        "id": "huggingface/fireworks-ai/MiniMaxAI/MiniMax-M2.5",
-        "label": "MiniMax M2.5",
+        "id": "MiniMaxAI/MiniMax-M2.7",
+        "label": "MiniMax M2.7",
         "provider": "huggingface",
         "recommended": True,
     },
    {
-        "id": "huggingface/novita/moonshotai/kimi-k2.5",
-        "label": "Kimi K2.5",
+        "id": "moonshotai/Kimi-K2.6",
+        "label": "Kimi K2.6",
         "provider": "huggingface",
    },
    {
-        "id": "huggingface/novita/zai-org/glm-5",
-        "label": "GLM 5",
+        "id": "zai-org/GLM-5.1",
+        "label": "GLM 5.1",
         "provider": "huggingface",
    },
 ]
@@ -93,7 +93,7 @@ async def llm_health_check() -> LLMHealthResponse:
     """
     model = session_manager.config.model_name
     try:
-        llm_params = _resolve_hf_router_params(model)
+        llm_params = _resolve_llm_params(model, reasoning_effort="high")
         await acompletion(
             messages=[{"role": "user", "content": "hi"}],
             max_tokens=1,
@@ -163,7 +163,7 @@ async def generate_title(
 ) -> dict:
     """Generate a short title for a chat session based on the first user message."""
     model = session_manager.config.model_name
-    llm_params = _resolve_hf_router_params(model)
+    llm_params = _resolve_llm_params(model, reasoning_effort="high")
     try:
         response = await acompletion(
             messages=[
frontend/src/components/Chat/ChatInput.tsx CHANGED
@@ -30,26 +30,26 @@ const MODEL_OPTIONS: ModelOption[] = [
     recommended: true,
   },
   {
-    id: 'minimax-m2.5',
-    name: 'MiniMax M2.5',
-    description: 'Via Fireworks',
-    modelPath: 'huggingface/fireworks-ai/MiniMaxAI/MiniMax-M2.5',
-    avatarUrl: getHfAvatarUrl('MiniMaxAI/MiniMax-M2.5'),
+    id: 'minimax-m2.7',
+    name: 'MiniMax M2.7',
+    description: 'HF auto-routed',
+    modelPath: 'MiniMaxAI/MiniMax-M2.7',
+    avatarUrl: getHfAvatarUrl('MiniMaxAI/MiniMax-M2.7'),
     recommended: true,
   },
   {
-    id: 'kimi-k2.5',
-    name: 'Kimi K2.5',
-    description: 'Via Novita',
-    modelPath: 'huggingface/novita/moonshotai/kimi-k2.5',
-    avatarUrl: getHfAvatarUrl('moonshotai/Kimi-K2.5'),
+    id: 'kimi-k2.6',
+    name: 'Kimi K2.6',
+    description: 'HF auto-routed',
+    modelPath: 'moonshotai/Kimi-K2.6',
+    avatarUrl: getHfAvatarUrl('moonshotai/Kimi-K2.6'),
   },
   {
-    id: 'glm-5',
-    name: 'GLM 5',
-    description: 'Via Novita',
-    modelPath: 'huggingface/novita/zai-org/glm-5',
-    avatarUrl: getHfAvatarUrl('zai-org/GLM-5'),
+    id: 'glm-5.1',
+    name: 'GLM 5.1',
+    description: 'HF auto-routed',
+    modelPath: 'zai-org/GLM-5.1',
+    avatarUrl: getHfAvatarUrl('zai-org/GLM-5.1'),
   },
 ];
 