Spaces:

smolagents
/

ml-intern

Running on CPU Upgrade

App Files Files Community

akseljoonas committed on Apr 23

Commit

2a1effd

2 Parent(s): 0718170 540437a

Merge remote-tracking branch 'github/main' into space-main

Browse files

Files changed (23) hide show

agent/config.py +9 -8
agent/context_manager/manager.py +5 -0
agent/core/agent_loop.py +77 -1
agent/core/effort_probe.py +229 -0
agent/core/llm_params.py +139 -24
agent/core/model_switcher.py +228 -0
agent/core/prompt_caching.py +59 -0
agent/core/session.py +23 -0
agent/main.py +31 -139
agent/tools/research_tool.py +17 -5
agent/utils/terminal_display.py +1 -1
backend/dependencies.py +109 -3
backend/routes/agent.py +150 -26
backend/session_manager.py +16 -1
backend/user_quotas.py +83 -0
frontend/src/components/Chat/ChatInput.tsx +100 -8
frontend/src/components/ClaudeCapDialog.tsx +134 -0
frontend/src/hooks/useAgentChat.ts +9 -1
frontend/src/hooks/useUserQuota.ts +51 -0
frontend/src/lib/sse-chat-transport.ts +6 -0
frontend/src/store/agentStore.ts +5 -0
frontend/src/utils/model.ts +15 -0
tests/unit/test_user_quotas.py +116 -0

agent/config.py CHANGED Viewed

@@ -33,14 +33,15 @@ class Config(BaseModel):
     confirm_cpu_jobs: bool = True
     auto_file_upload: bool = False
-    # Reasoning effort for models that support it (GPT-5 / o-series, Claude
-    # extended thinking, HF reasoning models like MiniMax M2 / Kimi K2).
-    # Defaults to "high" — we'd rather spend tokens thinking than ship a
-    # wrong ML recipe. Users can dial down with `/effort low|medium|off`.
-    # "minimal" is an OpenAI-only level and is normalized to "low" for HF
-    # router models (MiniMax requires ≥low). Ignored for non-reasoning models.
-    # Valid values: None | "minimal" | "low" | "medium" | "high"
-    reasoning_effort: str | None = "high"
 def substitute_env_vars(obj: Any) -> Any:

     confirm_cpu_jobs: bool = True
     auto_file_upload: bool = False
+    # Reasoning effort *preference* — the ceiling the user wants. The probe
+    # on `/model` walks a cascade down from here (``max`` → ``xhigh`` → ``high``
+    # → …) and caches per-model what the provider actually accepted in
+    # ``Session.model_effective_effort``. Default ``max`` because we'd rather
+    # burn tokens thinking than ship a wrong ML recipe; the cascade lands on
+    # whichever level the model supports (``high`` for GPT-5 / HF router,
+    # ``xhigh`` or ``max`` for Anthropic 4.6 / 4.7). ``None`` = thinking off.
+    # Valid values: None | "minimal" | "low" | "medium" | "high" | "xhigh" | "max"
+    reasoning_effort: str | None = "max"
 def substitute_env_vars(obj: Any) -> Any:

agent/context_manager/manager.py CHANGED Viewed

@@ -13,6 +13,8 @@ import yaml
 from jinja2 import Template
 from litellm import Message, acompletion
 logger = logging.getLogger(__name__)
 _HF_WHOAMI_URL = "https://huggingface.co/api/whoami-v2"
@@ -114,6 +116,9 @@ async def summarize_messages(
     prompt_messages = list(messages) + [Message(role="user", content=prompt)]
     llm_params = _resolve_llm_params(model_name, hf_token, reasoning_effort="high")
     response = await acompletion(
         messages=prompt_messages,
         max_completion_tokens=max_tokens,

 from jinja2 import Template
 from litellm import Message, acompletion
+from agent.core.prompt_caching import with_prompt_caching
 logger = logging.getLogger(__name__)
 _HF_WHOAMI_URL = "https://huggingface.co/api/whoami-v2"
     prompt_messages = list(messages) + [Message(role="user", content=prompt)]
     llm_params = _resolve_llm_params(model_name, hf_token, reasoning_effort="high")
+    prompt_messages, tool_specs = with_prompt_caching(
+        prompt_messages, tool_specs, llm_params.get("model")
+    )
     response = await acompletion(
         messages=prompt_messages,
         max_completion_tokens=max_tokens,

agent/core/agent_loop.py CHANGED Viewed

@@ -14,6 +14,7 @@ from litellm.exceptions import ContextWindowExceededError
 from agent.config import Config
 from agent.core.doom_loop import check_for_doom_loop
 from agent.core.llm_params import _resolve_llm_params
 from agent.core.session import Event, OpType, Session
 from agent.core.tools import ToolRouter
 from agent.tools.jobs_tool import CPU_FLAVORS
@@ -136,6 +137,58 @@ def _is_transient_error(error: Exception) -> bool:
     return any(pattern in err_str for pattern in transient_patterns)
 def _friendly_error_message(error: Exception) -> str | None:
     """Return a user-friendly message for known error types, or None to fall back to traceback."""
     err_str = str(error).lower()
@@ -243,6 +296,8 @@ class LLMResult:
 async def _call_llm_streaming(session: Session, messages, tools, llm_params) -> LLMResult:
     """Call the LLM with streaming, emitting assistant_chunk events."""
     response = None
     for _llm_attempt in range(_MAX_LLM_RETRIES):
         try:
             response = await acompletion(
@@ -258,6 +313,14 @@ async def _call_llm_streaming(session: Session, messages, tools, llm_params) ->
         except ContextWindowExceededError:
             raise
         except Exception as e:
             if _llm_attempt < _MAX_LLM_RETRIES - 1 and _is_transient_error(e):
                 _delay = _LLM_RETRY_DELAYS[_llm_attempt]
                 logger.warning(
@@ -328,6 +391,8 @@ async def _call_llm_streaming(session: Session, messages, tools, llm_params) ->
 async def _call_llm_non_streaming(session: Session, messages, tools, llm_params) -> LLMResult:
     """Call the LLM without streaming, emit assistant_message at the end."""
     response = None
     for _llm_attempt in range(_MAX_LLM_RETRIES):
         try:
             response = await acompletion(
@@ -342,6 +407,14 @@ async def _call_llm_non_streaming(session: Session, messages, tools, llm_params)
         except ContextWindowExceededError:
             raise
         except Exception as e:
             if _llm_attempt < _MAX_LLM_RETRIES - 1 and _is_transient_error(e):
                 _delay = _LLM_RETRY_DELAYS[_llm_attempt]
                 logger.warning(
@@ -490,10 +563,13 @@ class Handlers:
             tools = session.tool_router.get_tool_specs_for_llm()
             try:
                 # ── Call the LLM (streaming or non-streaming) ──
                 llm_params = _resolve_llm_params(
                     session.config.model_name,
                     session.hf_token,
-                    reasoning_effort=session.config.reasoning_effort,
                 )
                 if session.stream:
                     llm_result = await _call_llm_streaming(session, messages, tools, llm_params)

 from agent.config import Config
 from agent.core.doom_loop import check_for_doom_loop
 from agent.core.llm_params import _resolve_llm_params
+from agent.core.prompt_caching import with_prompt_caching
 from agent.core.session import Event, OpType, Session
 from agent.core.tools import ToolRouter
 from agent.tools.jobs_tool import CPU_FLAVORS
     return any(pattern in err_str for pattern in transient_patterns)
def _is_effort_config_error(error: Exception) -> bool:
    """Tell whether ``error`` is an effort-configuration rejection.

    Two rejections count: the model does not support thinking at all, or
    the specific effort level is invalid. Both are classified by helpers
    from ``agent.core.effort_probe``.

    This is the safety net for a ``/effort`` change made mid-conversation
    (which clears the probe cache) when the new level does not work for
    the current model: the cache is healed and the call retried once.

    Args:
        error: Exception raised by the LLM call.

    Returns:
        True when either classifier recognises the error.
    """
    # Function-scope import, as in the original.
    from agent.core.effort_probe import (
        _is_invalid_effort as _invalid_effort,
        _is_thinking_unsupported as _thinking_unsupported,
    )

    if _thinking_unsupported(error):
        return True
    return _invalid_effort(error)
async def _heal_effort_and_rebuild_params(
    session: Session, error: Exception, llm_params: dict,
) -> dict:
    """Repair the session's per-model effort cache and rebuild the LLM kwargs.

    Meant to be called only after ``_is_effort_config_error(error)`` was True.

    * thinking unsupported: cache ``None`` for the current model.
    * anything else (invalid effort level): re-run the cascade probe and
      cache the effort it reports; if the probe is inconclusive, cache
      ``None`` so the next call goes out without thinking params.

    Args:
        session: Session whose ``model_effective_effort`` mapping is updated.
        error: The exception that triggered healing.
        llm_params: Params of the failed call; not read by this function.

    Returns:
        New kwargs from ``_resolve_llm_params`` for the session's model,
        using ``session.effective_effort_for(model)`` as the effort.
    """
    from agent.core.effort_probe import ProbeInconclusive, _is_thinking_unsupported, probe_effort

    model = session.config.model_name

    def _rebuilt_params() -> dict:
        # Resolved after the cache write so the healed value is picked up.
        return _resolve_llm_params(
            model,
            session.hf_token,
            reasoning_effort=session.effective_effort_for(model),
        )

    if _is_thinking_unsupported(error):
        session.model_effective_effort[model] = None
        logger.info("healed: %s doesn't support thinking — stripped", model)
        return _rebuilt_params()

    try:
        outcome = await probe_effort(
            model, session.config.reasoning_effort, session.hf_token,
        )
    except ProbeInconclusive:
        # Transient failure while healing: strip thinking for safety. The
        # next call either succeeds or surfaces the real error.
        session.model_effective_effort[model] = None
        logger.info("healed: %s probe inconclusive — stripped", model)
    else:
        session.model_effective_effort[model] = outcome.effective_effort
        logger.info(
            "healed: %s effort cascade → %s", model, outcome.effective_effort,
        )
    return _rebuilt_params()
 def _friendly_error_message(error: Exception) -> str | None:
     """Return a user-friendly message for known error types, or None to fall back to traceback."""
     err_str = str(error).lower()
 async def _call_llm_streaming(session: Session, messages, tools, llm_params) -> LLMResult:
     """Call the LLM with streaming, emitting assistant_chunk events."""
     response = None
+    _healed_effort = False  # one-shot safety net per call
+    messages, tools = with_prompt_caching(messages, tools, llm_params.get("model"))
     for _llm_attempt in range(_MAX_LLM_RETRIES):
         try:
             response = await acompletion(
         except ContextWindowExceededError:
             raise
         except Exception as e:
+            if not _healed_effort and _is_effort_config_error(e):
+                _healed_effort = True
+                llm_params = await _heal_effort_and_rebuild_params(session, e, llm_params)
+                await session.send_event(Event(
+                    event_type="tool_log",
+                    data={"tool": "system", "log": "Reasoning effort not supported for this model — adjusting and retrying."},
+                ))
+                continue
             if _llm_attempt < _MAX_LLM_RETRIES - 1 and _is_transient_error(e):
                 _delay = _LLM_RETRY_DELAYS[_llm_attempt]
                 logger.warning(
 async def _call_llm_non_streaming(session: Session, messages, tools, llm_params) -> LLMResult:
     """Call the LLM without streaming, emit assistant_message at the end."""
     response = None
+    _healed_effort = False
+    messages, tools = with_prompt_caching(messages, tools, llm_params.get("model"))
     for _llm_attempt in range(_MAX_LLM_RETRIES):
         try:
             response = await acompletion(
         except ContextWindowExceededError:
             raise
         except Exception as e:
+            if not _healed_effort and _is_effort_config_error(e):
+                _healed_effort = True
+                llm_params = await _heal_effort_and_rebuild_params(session, e, llm_params)
+                await session.send_event(Event(
+                    event_type="tool_log",
+                    data={"tool": "system", "log": "Reasoning effort not supported for this model — adjusting and retrying."},
+                ))
+                continue
             if _llm_attempt < _MAX_LLM_RETRIES - 1 and _is_transient_error(e):
                 _delay = _LLM_RETRY_DELAYS[_llm_attempt]
                 logger.warning(
             tools = session.tool_router.get_tool_specs_for_llm()
             try:
                 # ── Call the LLM (streaming or non-streaming) ──
+                # Pull the per-model probed effort from the session cache when
+                # available; fall back to the raw preference for models we
+                # haven't probed yet (e.g. research sub-model).
                 llm_params = _resolve_llm_params(
                     session.config.model_name,
                     session.hf_token,
+                    reasoning_effort=session.effective_effort_for(session.config.model_name),
                 )
                 if session.stream:
                     llm_result = await _call_llm_streaming(session, messages, tools, llm_params)

agent/core/effort_probe.py ADDED Viewed

	@@ -0,0 +1,229 @@

+"""Probe-and-cascade for reasoning effort on /model switch.
+We don't maintain a per-model capability table. Instead, the first time a
+user picks a model we fire a 1-token ping with the same params we'd use
+for real and walk down a cascade (``max`` → ``xhigh`` → ``high`` → …)
+until the provider stops rejecting us. The result is cached per-model on
+the session, so real messages don't pay the probe cost again.
+Three outcomes, classified from the 400 error text:
+* success → cache the effort that worked
+* ``"thinking ... not supported"`` → model doesn't do thinking at all;
+  cache ``None`` so we stop sending thinking params
+* ``"effort ... invalid"`` / synonyms → cascade walks down and retries
+Transient errors (5xx, timeout, connection reset) bubble out as
+``ProbeInconclusive`` so the caller can complete the switch with a
+warning instead of blocking on a flaky provider.
+"""
+from __future__ import annotations
+import asyncio
+import logging
+from dataclasses import dataclass
+from litellm import acompletion
+from agent.core.llm_params import UnsupportedEffortError, _resolve_llm_params
+logger = logging.getLogger(__name__)
+# Cascade: for each user-stated preference, the ordered list of levels to
+# try. First success wins. ``max`` / ``xhigh`` are Anthropic-only; providers
+# that don't accept them raise ``UnsupportedEffortError`` synchronously (no
+# wasted network round-trip) and we advance to the next level.
+_EFFORT_CASCADE: dict[str, list[str]] = {
+    "max":     ["max", "xhigh", "high", "medium", "low"],
+    "xhigh":   ["xhigh", "high", "medium", "low"],
+    "high":    ["high", "medium", "low"],
+    "medium":  ["medium", "low"],
+    "minimal": ["minimal", "low"],
+    "low":     ["low"],
+}
+_PROBE_TIMEOUT = 15.0
+_PROBE_MAX_TOKENS = 16
class ProbeInconclusive(Exception):
    """Signals that the effort probe ended without a verdict.

    Raised for transient network / provider failures. The caller is
    expected to finish the model switch with a warning; if the problem is
    persistent, the next real call will surface it again.
    """
@dataclass
class ProbeOutcome:
    """Result of one effort probe.

    ``effective_effort`` uses the same convention as the per-model cache:
    a string is the level to send; ``None`` means thinking params should
    be stripped.
    """

    # Level to send, or None when thinking should not be sent at all.
    effective_effort: str | None
    # Number of probe requests attempted.
    attempts: int
    # Duration of the probe in milliseconds.
    elapsed_ms: int
    # Optional human-readable remark, e.g. "max not supported, falling back".
    note: str | None = None
+def _is_thinking_unsupported(e: Exception) -> bool:
+    """Model rejected any thinking config.
+    Matches Anthropic's 'thinking.type.enabled is not supported for this
+    model' as well as the adaptive variant. Substring-match because the
+    exact wording shifts across API versions.
+    """
+    s = str(e).lower()
+    return "thinking" in s and "not supported" in s
def _is_invalid_effort(e: Exception) -> bool:
    """Detect a rejection of the requested effort *level*.

    Matches both provider responses (400s saying "invalid", "must be one
    of", ...) and LiteLLM's local pre-flight validation (e.g.
    "effort='max' is only supported by Claude Opus 4.6"). The probe
    cascade steps down to the next level on either.

    Errors that are really about thinking itself are excluded first, even
    if their text mentions ``output_config.effort`` in a hint; those
    belong to ``_is_thinking_unsupported``.

    Args:
        e: Exception raised by the LLM call or by pre-flight validation.

    Returns:
        True when the lower-cased message mentions ``effort`` or
        ``output_config`` and contains one of the rejection phrases.
    """
    rejection_phrases = (
        "invalid", "not supported", "must be one of", "not a valid",
        "unrecognized", "unknown",
        # LiteLLM's own pre-flight validation phrasing.
        "only supported by", "is only supported",
    )

    if _is_thinking_unsupported(e):
        return False

    message = str(e).lower()
    mentions_effort = "effort" in message or "output_config" in message
    if not mentions_effort:
        return False

    for phrase in rejection_phrases:
        if phrase in message:
            return True
    return False
+def _is_transient(e: Exception) -> bool:
+    """Network / provider-side flake. Keep in sync with agent_loop's list.
+    Also matches by type for ``asyncio.TimeoutError`` — its ``str(e)`` is
+    empty, so substring matching alone misses it.
+    """
+    if isinstance(e, (asyncio.TimeoutError, TimeoutError)):
+        return True
+    s = str(e).lower()
+    return any(
+        p in s
+        for p in (
+            "timeout", "timed out", "429", "rate limit",
+            "503", "service unavailable", "502", "bad gateway",
+            "500", "internal server error", "overloaded", "capacity",
+            "connection reset", "connection refused", "connection error",
+            "eof", "broken pipe",
+        )
+    )
async def probe_effort(
    model_name: str,
    preference: str | None,
    hf_token: str | None,
) -> ProbeOutcome:
    """Walk the effort cascade for ``preference`` on ``model_name``.

    Each level is first resolved with ``_resolve_llm_params(...,
    strict=True)``. Levels the provider cannot express raise
    ``UnsupportedEffortError`` and are skipped without a network call.
    Every remaining level is tried with a short "ping" completion until
    one is accepted.

    Args:
        model_name: Model id to probe.
        preference: The user's effort ceiling. A falsy value means effort
            is off and nothing is probed.
        hf_token: Token forwarded to ``_resolve_llm_params``.

    Returns:
        A ``ProbeOutcome`` whose ``effective_effort`` is the first accepted
        level, or ``None`` when thinking is rejected outright, effort is
        off, or no level in the cascade was accepted.

    Raises:
        ProbeInconclusive: On transient errors (5xx, timeout, ...).
        Exception: Any other error unrelated to thinking or effort (auth,
            model-not-found, quota, ...) propagates unchanged so the
            caller can surface it.
    """
    if not preference:
        # Effort explicitly off: a bare ping with no thinking params would
        # teach us nothing, so report "off" immediately.
        return ProbeOutcome(effective_effort=None, attempts=0, elapsed_ms=0)

    # We are inside a coroutine, so the running loop is the right clock source.
    loop = asyncio.get_running_loop()
    start = loop.time()

    def _elapsed_ms() -> int:
        return int((loop.time() - start) * 1000)

    attempts = 0
    skipped: list[str] = []  # levels rejected synchronously, no request made

    for effort in _EFFORT_CASCADE.get(preference, [preference]):
        try:
            params = _resolve_llm_params(
                model_name, hf_token, reasoning_effort=effort, strict=True,
            )
        except UnsupportedEffortError:
            # The provider cannot even accept this effort name (e.g. "max"
            # on the HF router). Skip without a network call.
            skipped.append(effort)
            continue

        attempts += 1
        try:
            await asyncio.wait_for(
                acompletion(
                    messages=[{"role": "user", "content": "ping"}],
                    max_tokens=_PROBE_MAX_TOKENS,
                    stream=False,
                    **params,
                ),
                timeout=_PROBE_TIMEOUT,
            )
        except Exception as e:
            if _is_thinking_unsupported(e):
                return ProbeOutcome(
                    effective_effort=None,
                    attempts=attempts,
                    elapsed_ms=_elapsed_ms(),
                    note="model doesn't support reasoning, dropped",
                )
            if _is_invalid_effort(e):
                logger.debug("probe: %s rejected effort=%s, trying next", model_name, effort)
                continue
            if _is_transient(e):
                raise ProbeInconclusive(str(e)) from e
            # Persistent error unrelated to thinking (auth, quota,
            # model-not-found): let the caller classify and surface it.
            raise

        # The ping succeeded: this level is accepted.
        note = None
        if effort != preference:
            note = f"{preference} not supported, using {effort}"
        return ProbeOutcome(
            effective_effort=effort,
            attempts=attempts,
            elapsed_ms=_elapsed_ms(),
            note=note,
        )

    # Cascade exhausted. Every level was either skipped synchronously or
    # rejected as an invalid effort; any other failure has already returned
    # or raised inside the loop, so no error is left to re-raise here.
    note = (
        "no effort level accepted — proceeding without thinking"
        if not skipped
        else f"provider rejected all efforts ({', '.join(skipped)})"
    )
    return ProbeOutcome(
        effective_effort=None,
        attempts=attempts,
        elapsed_ms=_elapsed_ms(),
        note=note,
    )

agent/core/llm_params.py CHANGED Viewed

@@ -8,41 +8,122 @@ creating circular imports.
 import os
-# HF router reasoning models only accept "low" | "medium" | "high" (e.g.
-# MiniMax M2 actually *requires* reasoning to be enabled). OpenAI's GPT-5
-# also accepts "minimal" for near-zero thinking. We map "minimal" to "low"
-# for HF so the user doesn't get a 400.
-_HF_ALLOWED_EFFORTS = {"low", "medium", "high"}
 def _resolve_llm_params(
     model_name: str,
     session_hf_token: str | None = None,
     reasoning_effort: str | None = None,
 ) -> dict:
     """
     Build LiteLLM kwargs for a given model id.
-    • ``anthropic/<model>`` / ``openai/<model>`` — passed straight through; the
-      user's own ``ANTHROPIC_API_KEY`` / ``OPENAI_API_KEY`` env vars are picked
-      up by LiteLLM. ``reasoning_effort`` is forwarded as a top-level param
-      (GPT-5 / o-series accept "minimal" | "low" | "medium" | "high"; Claude
-      extended-thinking models accept "low" | "medium" | "high" and LiteLLM
-      translates to the thinking config).
     • Anything else is treated as a HuggingFace router id. We hit the
       auto-routing OpenAI-compatible endpoint at
-      ``https://router.huggingface.co/v1``, which bypasses LiteLLM's stale
-      per-provider HF adapter entirely. The id can be bare or carry an HF
-      routing suffix:
-          MiniMaxAI/MiniMax-M2.7              # auto = fastest + failover
-          MiniMaxAI/MiniMax-M2.7:cheapest
-          moonshotai/Kimi-K2.6:novita         # pin a specific provider
-      A leading ``huggingface/`` is stripped for convenience. ``reasoning_effort``
-      is forwarded via ``extra_body`` (LiteLLM's OpenAI adapter refuses it as a
-      top-level kwarg for non-OpenAI models). "minimal" is normalized to "low".
     Token precedence (first non-empty wins):
       1. INFERENCE_TOKEN env — shared key on the hosted Space (inference is
@@ -50,10 +131,39 @@ def _resolve_llm_params(
       2. session.hf_token — the user's own token (CLI / OAuth / cache file).
       3. HF_TOKEN env — belt-and-suspenders fallback for CLI users.
     """
-    if model_name.startswith(("anthropic/", "openai/")):
         params: dict = {"model": model_name}
         if reasoning_effort:
-            params["reasoning_effort"] = reasoning_effort
         return params
     hf_model = model_name.removeprefix("huggingface/")
@@ -72,6 +182,11 @@ def _resolve_llm_params(
         params["extra_headers"] = {"X-HF-Bill-To": bill_to}
     if reasoning_effort:
         hf_level = "low" if reasoning_effort == "minimal" else reasoning_effort
-        if hf_level in _HF_ALLOWED_EFFORTS:
             params["extra_body"] = {"reasoning_effort": hf_level}
     return params

 import os
+def _patch_litellm_effort_validation() -> None:
+    """Neuter LiteLLM 1.83's hardcoded effort-level validation.
+    Context: at ``litellm/llms/anthropic/chat/transformation.py:~1443`` the
+    Anthropic adapter validates ``output_config.effort ∈ {high, medium,
+    low, max}`` and gates ``max`` behind an ``_is_opus_4_6_model`` check
+    that only matches the substring ``opus-4-6`` / ``opus_4_6``. Result:
+    * ``xhigh`` — valid on Anthropic's real API for Claude 4.7 — is
+      rejected pre-flight with "Invalid effort value: xhigh".
+    * ``max`` on Opus 4.7 is rejected with "effort='max' is only supported
+      by Claude Opus 4.6", even though Opus 4.7 accepts it in practice.
+    We don't want to maintain a parallel model table, so we let the
+    Anthropic API itself be the validator: widen ``_is_opus_4_6_model``
+    to also match ``opus-4-7``+ families, and drop the valid-effort-set
+    check entirely. If Anthropic rejects an effort level, we see a 400
+    and the cascade walks down — exactly the behavior we want for any
+    future model family.
+    Removable once litellm ships 1.83.8-stable (which merges PR #25867,
+    "Litellm day 0 opus 4.7 support") — see commit 0868a82 on their main
+    branch. Until then, this one-time patch is the escape hatch.
+    """
+    try:
+        from litellm.llms.anthropic.chat import transformation as _t
+    except Exception:
+        return
+    cfg = getattr(_t, "AnthropicConfig", None)
+    if cfg is None:
+        return
+    original = getattr(cfg, "_is_opus_4_6_model", None)
+    if original is None or getattr(original, "_hf_agent_patched", False):
+        return
+    def _widened(model: str) -> bool:
+        m = model.lower()
+        # Original 4.6 match plus any future Opus >= 4.6. We only need this
+        # to return True for families where "max" / "xhigh" are acceptable
+        # at the API; the cascade handles the case when they're not.
+        return any(
+            v in m for v in (
+                "opus-4-6", "opus_4_6", "opus-4.6", "opus_4.6",
+                "opus-4-7", "opus_4_7", "opus-4.7", "opus_4.7",
+            )
+        )
+    _widened._hf_agent_patched = True  # type: ignore[attr-defined]
+    cfg._is_opus_4_6_model = staticmethod(_widened)
+_patch_litellm_effort_validation()
+# Effort levels accepted on the wire.
+#   Anthropic (4.6+):  low | medium | high | xhigh | max   (output_config.effort)
+#   OpenAI direct:     minimal | low | medium | high       (reasoning_effort top-level)
+#   HF router:         low | medium | high                 (extra_body.reasoning_effort)
+#
+# We validate *shape* here and let the probe cascade walk down on rejection;
+# we deliberately do NOT maintain a per-model capability table.
+_ANTHROPIC_EFFORTS = {"low", "medium", "high", "xhigh", "max"}
+_OPENAI_EFFORTS = {"minimal", "low", "medium", "high"}
+_HF_EFFORTS = {"low", "medium", "high"}
class UnsupportedEffortError(ValueError):
    """The requested effort level cannot be expressed for this provider.

    Raised synchronously, before any network call, so the probe cascade
    can skip a level the provider's API surface does not accept (for
    example ``max`` on the HF router).
    """
 def _resolve_llm_params(
     model_name: str,
     session_hf_token: str | None = None,
     reasoning_effort: str | None = None,
+    strict: bool = False,
 ) -> dict:
     """
     Build LiteLLM kwargs for a given model id.
+    • ``anthropic/<model>`` — native thinking config. We bypass LiteLLM's
+      ``reasoning_effort`` → ``thinking`` mapping (which lags new Claude
+      releases like 4.7 and sends the wrong API shape). Instead we pass
+      both ``thinking={"type": "adaptive"}`` and ``output_config=
+      {"effort": <level>}`` as top-level kwargs — LiteLLM's Anthropic
+      adapter forwards unknown top-level kwargs into the request body
+      verbatim (confirmed by live probe; ``extra_body`` does NOT work
+      here because Anthropic's API rejects it as "Extra inputs are not
+      permitted"). This is the stable API for 4.6 and 4.7. Older
+      extended-thinking models that only accept ``thinking.type.enabled``
+      will reject this; the probe's cascade catches that and falls back
+      to no thinking.
+    • ``openai/<model>`` — ``reasoning_effort`` forwarded as a top-level
+      kwarg (GPT-5 / o-series). LiteLLM uses the user's ``OPENAI_API_KEY``.
     • Anything else is treated as a HuggingFace router id. We hit the
       auto-routing OpenAI-compatible endpoint at
+      ``https://router.huggingface.co/v1``. The id can be bare or carry an
+      HF routing suffix (``:fastest`` / ``:cheapest`` / ``:<provider>``).
+      A leading ``huggingface/`` is stripped. ``reasoning_effort`` is
+      forwarded via ``extra_body`` (LiteLLM's OpenAI adapter refuses it as
+      a top-level kwarg for non-OpenAI models). "minimal" normalizes to
+      "low".
+    ``strict=True`` raises ``UnsupportedEffortError`` when the requested
+    effort isn't in the provider's accepted set, instead of silently
+    dropping it. The probe cascade uses strict mode so it can walk down
+    (``max`` → ``xhigh`` → ``high`` …) without making an API call. Regular
+    runtime callers leave ``strict=False``, so a stale cached effort
+    can't crash a turn — it just doesn't get sent.
     Token precedence (first non-empty wins):
       1. INFERENCE_TOKEN env — shared key on the hosted Space (inference is
       2. session.hf_token — the user's own token (CLI / OAuth / cache file).
       3. HF_TOKEN env — belt-and-suspenders fallback for CLI users.
     """
+    if model_name.startswith("anthropic/"):
         params: dict = {"model": model_name}
         if reasoning_effort:
+            level = reasoning_effort
+            if level == "minimal":
+                level = "low"
+            if level not in _ANTHROPIC_EFFORTS:
+                if strict:
+                    raise UnsupportedEffortError(
+                        f"Anthropic doesn't accept effort={level!r}"
+                    )
+            else:
+                # Adaptive thinking + output_config.effort is the stable
+                # Anthropic API for Claude 4.6 / 4.7. Both kwargs are
+                # passed top-level: LiteLLM forwards unknown params into
+                # the request body for Anthropic, so ``output_config``
+                # reaches the API. ``extra_body`` does NOT work here —
+                # Anthropic rejects it as "Extra inputs are not
+                # permitted".
+                params["thinking"] = {"type": "adaptive"}
+                params["output_config"] = {"effort": level}
+        return params
+    if model_name.startswith("openai/"):
+        params = {"model": model_name}
+        if reasoning_effort:
+            if reasoning_effort not in _OPENAI_EFFORTS:
+                if strict:
+                    raise UnsupportedEffortError(
+                        f"OpenAI doesn't accept effort={reasoning_effort!r}"
+                    )
+            else:
+                params["reasoning_effort"] = reasoning_effort
         return params
     hf_model = model_name.removeprefix("huggingface/")
         params["extra_headers"] = {"X-HF-Bill-To": bill_to}
     if reasoning_effort:
         hf_level = "low" if reasoning_effort == "minimal" else reasoning_effort
+        if hf_level not in _HF_EFFORTS:
+            if strict:
+                raise UnsupportedEffortError(
+                    f"HF router doesn't accept effort={hf_level!r}"
+                )
+        else:
             params["extra_body"] = {"reasoning_effort": hf_level}
     return params

agent/core/model_switcher.py ADDED Viewed

	@@ -0,0 +1,228 @@

+"""Model-switching logic for the interactive CLI's ``/model`` command.
+Split out of ``agent.main`` so the REPL dispatcher stays focused on input
+parsing. Exposes:
+* ``SUGGESTED_MODELS`` — the short list shown by ``/model`` with no arg.
+* ``is_valid_model_id`` — loose format check on user input.
+* ``probe_and_switch_model`` — async: checks routing, fires a 1-token
+  probe to resolve the effort cascade, then commits the switch (or
+  rejects it on hard error).
+The probe's cascade lives in ``agent.core.effort_probe``; this module
+glues it to CLI output + session state.
+"""
+from __future__ import annotations
+from agent.core.effort_probe import ProbeInconclusive, probe_effort
+# Suggested models shown by `/model` (not a gate). Users can paste any HF
+# model id (e.g. "MiniMaxAI/MiniMax-M2.7") or an `anthropic/` / `openai/`
+# prefix for direct API access. For HF ids, append ":fastest" /
+# ":cheapest" / ":preferred" / ":<provider>" to override the default
+# routing policy (auto = fastest with failover).
+SUGGESTED_MODELS = [
+    {"id": "anthropic/claude-opus-4-7", "label": "Claude Opus 4.7"},
+    {"id": "anthropic/claude-opus-4-6", "label": "Claude Opus 4.6"},
+    {"id": "MiniMaxAI/MiniMax-M2.7", "label": "MiniMax M2.7"},
+    {"id": "moonshotai/Kimi-K2.6", "label": "Kimi K2.6"},
+    {"id": "zai-org/GLM-5.1", "label": "GLM 5.1"},
+]
+# ``:<tag>`` suffixes that name a routing policy. Any other non-empty tag
+# is treated as a provider name by ``_print_hf_routing_info``.
+_ROUTING_POLICIES = {"fastest", "cheapest", "preferred"}
+def is_valid_model_id(model_id: str) -> bool:
+    """Loose format check — lets users pick any model id.
+    Accepts:
+      • anthropic/<model>
+      • openai/<model>
+      • <org>/<model>[:<tag>]            (HF router; tag = provider or policy)
+      • huggingface/<org>/<model>[:<tag>] (same, accepts legacy prefix)
+    Actual availability is verified against the HF router catalog on
+    switch, and by the provider on the probe's ping call.
+    """
+    if not model_id or "/" not in model_id:
+        return False
+    head = model_id.split(":", 1)[0]
+    parts = head.split("/")
+    return len(parts) >= 2 and all(parts)
+def _print_hf_routing_info(model_id: str, console) -> bool:
+    """Show HF router catalog info (providers, price, context, tool support)
+    for an HF-router model id. Returns ``True`` to signal the caller can
+    proceed with the switch, ``False`` to indicate a hard problem the user
+    should notice before we fire the effort probe.
+    Anthropic / OpenAI ids return ``True`` without printing anything —
+    the probe below covers "does this model exist".
+    """
+    if model_id.startswith(("anthropic/", "openai/")):
+        return True
+    from agent.core import hf_router_catalog as cat
+    bare, _, tag = model_id.partition(":")
+    info = cat.lookup(bare)
+    if info is None:
+        console.print(
+            f"[bold red]Warning:[/bold red] '{bare}' isn't in the HF router "
+            "catalog. Checking anyway — first call may fail."
+        )
+        suggestions = cat.fuzzy_suggest(bare)
+        if suggestions:
+            console.print(f"[dim]Did you mean: {', '.join(suggestions)}[/dim]")
+        return True
+    live = info.live_providers
+    if not live:
+        console.print(
+            f"[bold red]Warning:[/bold red] '{bare}' has no live providers "
+            "right now. First call will likely fail."
+        )
+        return True
+    if tag and tag not in _ROUTING_POLICIES:
+        matched = [p for p in live if p.provider == tag]
+        if not matched:
+            names = ", ".join(p.provider for p in live)
+            console.print(
+                f"[bold red]Warning:[/bold red] provider '{tag}' doesn't serve "
+                f"'{bare}'. Live providers: {names}. Checking anyway."
+            )
+    if not info.any_supports_tools:
+        console.print(
+            f"[bold red]Warning:[/bold red] no provider for '{bare}' advertises "
+            "tool-call support. This agent relies on tool calls — expect errors."
+        )
+    if tag in _ROUTING_POLICIES:
+        policy = tag
+    elif tag:
+        policy = f"pinned to {tag}"
+    else:
+        policy = "auto (fastest)"
+    console.print(f"  [dim]routing: {policy}[/dim]")
+    for p in live:
+        price = (
+            f"${p.input_price:g}/${p.output_price:g} per M tok"
+            if p.input_price is not None and p.output_price is not None
+            else "price n/a"
+        )
+        ctx = f"{p.context_length:,} ctx" if p.context_length else "ctx n/a"
+        tools = "tools" if p.supports_tools else "no tools"
+        console.print(
+            f"  [dim]{p.provider}: {price}, {ctx}, {tools}[/dim]"
+        )
+    return True
+def print_model_listing(config, console) -> None:
+    """Render the default ``/model`` (no-arg) view: current + suggested."""
+    current = config.model_name if config else ""
+    console.print("[bold]Current model:[/bold]")
+    console.print(f"  {current}")
+    console.print("\n[bold]Suggested:[/bold]")
+    for m in SUGGESTED_MODELS:
+        marker = " [dim]<-- current[/dim]" if m["id"] == current else ""
+        console.print(f"  {m['id']}  [dim]({m['label']})[/dim]{marker}")
+    console.print(
+        "\n[dim]Paste any HF model id (e.g. 'MiniMaxAI/MiniMax-M2.7').\n"
+        "Add ':fastest', ':cheapest', ':preferred', or ':<provider>' to override routing.\n"
+        "Use 'anthropic/<model>' or 'openai/<model>' for direct API access.[/dim]"
+    )
+def print_invalid_id(arg: str, console) -> None:
+    console.print(f"[bold red]Invalid model id format:[/bold red] {arg}")
+    console.print(
+        "[dim]Expected:\n"
+        "  • <org>/<model>[:tag]    (HF router — paste from huggingface.co)\n"
+        "  • anthropic/<model>\n"
+        "  • openai/<model>[/dim]"
+    )
+async def probe_and_switch_model(
+    model_id: str,
+    config,
+    session,
+    console,
+    hf_token: str | None,
+) -> None:
+    """Validate model+effort with a 1-token ping, cache the effective effort,
+    then commit the switch.
+    Three visible outcomes:
+    * ✓ ``effort: <level>`` — model accepted the preferred effort (or a
+      fallback from the cascade; the note explains if so)
+    * ✓ ``effort: off`` — model doesn't support thinking; we'll strip it
+    * ✗ hard error (auth, model-not-found, quota) — we reject the switch
+      and keep the current model so the user isn't stranded
+    Transient errors (5xx, timeout) complete the switch with a yellow
+    warning; the next real call re-surfaces the error if it's persistent.
+    """
+    preference = config.reasoning_effort
+    if not _print_hf_routing_info(model_id, console):
+        return
+    if not preference:
+        # Nothing to validate with a ping that we couldn't validate on the
+        # first real call just as cheaply. Skip the probe entirely.
+        _commit_switch(model_id, config, session, effective=None, cache=False)
+        console.print(f"[green]Model switched to {model_id}[/green] [dim](effort: off)[/dim]")
+        return
+    console.print(f"[dim]checking {model_id} (effort: {preference})...[/dim]")
+    try:
+        outcome = await probe_effort(model_id, preference, hf_token)
+    except ProbeInconclusive as e:
+        _commit_switch(model_id, config, session, effective=None, cache=False)
+        console.print(
+            f"[yellow]Model switched to {model_id}[/yellow] "
+            f"[dim](couldn't validate: {e}; will verify on first message)[/dim]"
+        )
+        return
+    except Exception as e:
+        # Hard persistent error — auth, unknown model, quota. Don't switch.
+        console.print(f"[bold red]Switch failed:[/bold red] {e}")
+        console.print(f"[dim]Keeping current model: {config.model_name}[/dim]")
+        return
+    _commit_switch(
+        model_id, config, session,
+        effective=outcome.effective_effort, cache=True,
+    )
+    effort_label = outcome.effective_effort or "off"
+    suffix = f" — {outcome.note}" if outcome.note else ""
+    console.print(
+        f"[green]Model switched to {model_id}[/green] "
+        f"[dim](effort: {effort_label}{suffix}, {outcome.elapsed_ms}ms)[/dim]"
+    )
+def _commit_switch(model_id, config, session, effective, cache: bool) -> None:
+    """Apply the switch to the session (or bare config if no session yet).
+    ``effective`` is the probe's resolved effort; ``cache=True`` stores it
+    in the session's per-model cache so real calls use the resolved level
+    instead of re-probing. ``cache=False`` (inconclusive probe / effort
+    off) leaves the cache untouched — next call falls back to preference.
+    """
+    if session is not None:
+        session.update_model(model_id)
+        if cache:
+            session.model_effective_effort[model_id] = effective
+        else:
+            session.model_effective_effort.pop(model_id, None)
+    else:
+        config.model_name = model_id

agent/core/prompt_caching.py ADDED Viewed

	@@ -0,0 +1,59 @@

+"""Anthropic prompt caching breakpoints for outgoing LLM requests.
+Caching is GA on Anthropic's API and natively supported by litellm >=1.83
+via ``cache_control`` blocks. We apply two breakpoints (out of 4 allowed):
+  1. The tool block — caches all tool definitions as a single prefix.
+  2. The system message — caches the rendered system prompt.
+Together these cover the ~4-5K static tokens that were being re-billed on
+every turn. Subsequent turns within the 5-minute TTL hit cache_read pricing
+(~10% of input cost) instead of full input.
+Non-Anthropic models (HF router, OpenAI) are passed through unchanged.
+"""
+from typing import Any
+def with_prompt_caching(
+    messages: list[Any],
+    tools: list[dict] | None,
+    model_name: str | None,
+) -> tuple[list[Any], list[dict] | None]:
+    """Return (messages, tools) with cache_control breakpoints for Anthropic.
+    No-op for non-Anthropic models. Original objects are not mutated; a fresh
+    list with replaced first message and last tool is returned, so callers
+    that share the underlying ``ContextManager.items`` list don't see their
+    persisted history rewritten.
+    """
+    if not model_name or not model_name.startswith("anthropic/"):
+        return messages, tools
+    if tools:
+        new_tools = list(tools)
+        last = dict(new_tools[-1])
+        last["cache_control"] = {"type": "ephemeral"}
+        new_tools[-1] = last
+        tools = new_tools
+    if messages:
+        first = messages[0]
+        role = first.get("role") if isinstance(first, dict) else getattr(first, "role", None)
+        if role == "system":
+            content = (
+                first.get("content")
+                if isinstance(first, dict)
+                else getattr(first, "content", None)
+            )
+            if isinstance(content, str) and content:
+                cached_block = [{
+                    "type": "text",
+                    "text": content,
+                    "cache_control": {"type": "ephemeral"},
+                }]
+                new_first = {"role": "system", "content": cached_block}
+                messages = [new_first] + list(messages[1:])
+    return messages, tools

agent/core/session.py CHANGED Viewed

@@ -109,6 +109,16 @@ class Session:
         self.turn_count: int = 0
         self.last_auto_save_turn: int = 0
     async def send_event(self, event: Event) -> None:
         """Send event back to client and log to trajectory"""
         await self.event_queue.put(event)
@@ -139,6 +149,19 @@ class Session:
         self.config.model_name = model_name
         self.context_manager.model_max_tokens = _get_max_tokens_safe(model_name)
     def increment_turn(self) -> None:
         """Increment turn counter (called after each user interaction)"""
         self.turn_count += 1

         self.turn_count: int = 0
         self.last_auto_save_turn: int = 0
+        # Per-model probed reasoning-effort cache. Populated by the probe
+        # on /model switch, read by ``effective_effort_for`` below. Keys are
+        # raw model ids (including any ``:tag``). Values:
+        #   str  → the effort level to send (may be a downgrade from the
+        #          preference, e.g. "high" when user asked for "max")
+        #   None → model rejected all efforts in the cascade; send no
+        #          thinking params at all
+        # Key absent → not probed yet; fall back to the raw preference.
+        self.model_effective_effort: dict[str, str | None] = {}
     async def send_event(self, event: Event) -> None:
         """Send event back to client and log to trajectory"""
         await self.event_queue.put(event)
         self.config.model_name = model_name
         self.context_manager.model_max_tokens = _get_max_tokens_safe(model_name)
+    def effective_effort_for(self, model_name: str) -> str | None:
+        """Resolve the effort level to actually send for ``model_name``.
+        Returns the probed result when we have one (may be ``None`` meaning
+        "model doesn't do thinking, strip it"), else the raw preference.
+        Unknown-model case falls back to the preference so a stale cache
+        from a prior ``/model`` can't poison research sub-calls that use a
+        different model id.
+        """
+        if model_name in self.model_effective_effort:
+            return self.model_effective_effort[model_name]
+        return self.config.reasoning_effort
     def increment_turn(self) -> None:
         """Increment turn counter (called after each user interaction)"""
         self.turn_count += 1

agent/main.py CHANGED Viewed

@@ -22,6 +22,7 @@ from prompt_toolkit import PromptSession
 from agent.config import load_config
 from agent.core.agent_loop import submission_loop
 from agent.core.session import OpType
 from agent.core.tools import ToolRouter
 from agent.utils.reliability_checks import check_training_script_save_pattern
@@ -49,39 +50,6 @@ litellm.drop_params = True
 # on every error — users don't need it, and our friendly errors cover the case.
 litellm.suppress_debug_info = True
-# ── Suggested models shown by `/model` (not a gate) ──────────────────────
-# Users can paste any HF model id (e.g. "MiniMaxAI/MiniMax-M2.7") or use one
-# of the `anthropic/` / `openai/` prefixes for direct API access. For HF ids,
-# append ":fastest" / ":cheapest" / ":preferred" / ":<provider>" to override
-# the default routing policy (auto = fastest with failover).
-SUGGESTED_MODELS = [
-    {"id": "anthropic/claude-opus-4-6", "label": "Claude Opus 4.6"},
-    {"id": "MiniMaxAI/MiniMax-M2.7", "label": "MiniMax M2.7"},
-    {"id": "moonshotai/Kimi-K2.6", "label": "Kimi K2.6"},
-    {"id": "zai-org/GLM-5.1", "label": "GLM 5.1"},
-]
-def _is_valid_model_id(model_id: str) -> bool:
-    """Loose format check — lets users pick any model id.
-    Accepts:
-      • anthropic/<model>
-      • openai/<model>
-      • <org>/<model>[:<tag>]            (HF router; tag = provider or policy)
-      • huggingface/<org>/<model>[:<tag>] (same, accepts legacy prefix)
-    Actual availability is verified against the HF router catalog on switch,
-    or by the provider on first call.
-    """
-    if not model_id or "/" not in model_id:
-        return False
-    # Strip :tag suffix before structural check
-    head = model_id.split(":", 1)[0]
-    parts = head.split("/")
-    return len(parts) >= 2 and all(parts)
 def _safe_get_args(arguments: dict) -> dict:
     """Safely extract args dict from arguments, handling cases where LLM passes string."""
     args = arguments.get("args", {})
@@ -91,80 +59,6 @@ def _safe_get_args(arguments: dict) -> dict:
     return args if isinstance(args, dict) else {}
-_ROUTING_POLICIES = {"fastest", "cheapest", "preferred"}
-def _print_model_preflight(model_id: str, console) -> None:
-    """Validate a model switch against the HF router catalog and show the
-    user what they're about to use (providers, price, context, tool support).
-    Anthropic/OpenAI ids skip the catalog — those are direct API calls.
-    For unknown HF ids we print a red warning with fuzzy suggestions but
-    still allow the switch (the catalog might be lagging).
-    """
-    if model_id.startswith(("anthropic/", "openai/")):
-        console.print(f"[green]Model switched to {model_id}[/green]")
-        return
-    from agent.core import hf_router_catalog as cat
-    bare, _, tag = model_id.partition(":")
-    info = cat.lookup(bare)
-    if info is None:
-        console.print(
-            f"[bold red]Warning:[/bold red] '{bare}' isn't in the HF router "
-            "catalog. Switching anyway — first call may fail."
-        )
-        suggestions = cat.fuzzy_suggest(bare)
-        if suggestions:
-            console.print(f"[dim]Did you mean: {', '.join(suggestions)}[/dim]")
-        return
-    live = info.live_providers
-    if not live:
-        console.print(
-            f"[bold red]Warning:[/bold red] '{bare}' has no live providers "
-            "right now. First call will likely fail."
-        )
-        return
-    if tag and tag not in _ROUTING_POLICIES:
-        matched = [p for p in live if p.provider == tag]
-        if not matched:
-            names = ", ".join(p.provider for p in live)
-            console.print(
-                f"[bold red]Warning:[/bold red] provider '{tag}' doesn't serve "
-                f"'{bare}'. Live providers: {names}. Switching anyway."
-            )
-            return
-    if not info.any_supports_tools:
-        console.print(
-            f"[bold red]Warning:[/bold red] no provider for '{bare}' advertises "
-            "tool-call support. This agent relies on tool calls — expect errors."
-        )
-    console.print(f"[green]Model switched to {model_id}[/green]")
-    if tag in _ROUTING_POLICIES:
-        policy = tag
-    elif tag:
-        policy = f"pinned to {tag}"
-    else:
-        policy = "auto (fastest)"
-    console.print(f"  [dim]routing: {policy}[/dim]")
-    for p in live:
-        price = (
-            f"${p.input_price:g}/${p.output_price:g} per M tok"
-            if p.input_price is not None and p.output_price is not None
-            else "price n/a"
-        )
-        ctx = f"{p.context_length:,} ctx" if p.context_length else "ctx n/a"
-        tools = "tools" if p.supports_tools else "no tools"
-        console.print(
-            f"  [dim]{p.provider}: {price}, {ctx}, {tools}[/dim]"
-        )
 def _get_hf_token() -> str | None:
     """Get HF token from environment, huggingface_hub API, or cached token file."""
     token = os.environ.get("HF_TOKEN")
@@ -807,7 +701,7 @@ async def get_user_input(prompt_session: PromptSession) -> str:
 # Slash commands are defined in terminal_display
-def _handle_slash_command(
     cmd: str,
     config,
     session_holder: list,
@@ -817,6 +711,9 @@ def _handle_slash_command(
     """
     Handle a slash command. Returns a Submission to enqueue, or None if
     the command was handled locally (caller should set turn_complete_event).
     """
     parts = cmd.strip().split(None, 1)
     command = parts[0].lower()
@@ -843,35 +740,16 @@ def _handle_slash_command(
     if command == "/model":
         console = get_console()
         if not arg:
-            current = config.model_name if config else ""
-            console.print("[bold]Current model:[/bold]")
-            console.print(f"  {current}")
-            console.print("\n[bold]Suggested:[/bold]")
-            for m in SUGGESTED_MODELS:
-                marker = " [dim]<-- current[/dim]" if m["id"] == current else ""
-                console.print(f"  {m['id']}  [dim]({m['label']})[/dim]{marker}")
-            console.print(
-                "\n[dim]Paste any HF model id (e.g. 'MiniMaxAI/MiniMax-M2.7').\n"
-                "Add ':fastest', ':cheapest', ':preferred', or ':<provider>' to override routing.\n"
-                "Use 'anthropic/<model>' or 'openai/<model>' for direct API access.[/dim]"
-            )
             return None
-        if not _is_valid_model_id(arg):
-            console.print(f"[bold red]Invalid model id format:[/bold red] {arg}")
-            console.print(
-                "[dim]Expected:\n"
-                "  • <org>/<model>[:tag]    (HF router — paste from huggingface.co)\n"
-                "  • anthropic/<model>\n"
-                "  • openai/<model>[/dim]"
-            )
             return None
         normalized = arg.removeprefix("huggingface/")
-        _print_model_preflight(normalized, console)
         session = session_holder[0] if session_holder else None
-        if session:
-            session.update_model(normalized)
-        else:
-            config.model_name = normalized
         return None
     if command == "/yolo":
@@ -882,14 +760,19 @@ def _handle_slash_command(
     if command == "/effort":
         console = get_console()
-        valid = {"minimal", "low", "medium", "high", "off"}
         if not arg:
             current = config.reasoning_effort or "off"
-            console.print(f"[bold]Reasoning effort:[/bold] {current}")
             console.print(
-                "[dim]Set with '/effort minimal|low|medium|high|off'. "
-                "Applies to models that support it (GPT-5 / o-series, Claude "
-                "extended thinking, HF reasoning models); dropped otherwise.[/dim]"
             )
             return None
         level = arg.lower()
@@ -898,7 +781,16 @@ def _handle_slash_command(
             console.print(f"[dim]Expected one of: {', '.join(sorted(valid))}[/dim]")
             return None
         config.reasoning_effort = None if level == "off" else level
         console.print(f"[green]Reasoning effort: {level}[/green]")
         return None
     if command == "/status":
@@ -1083,7 +975,7 @@ async def main():
             # Handle slash commands
             if user_input.strip().startswith("/"):
-                sub = _handle_slash_command(
                     user_input.strip(), config, session_holder, submission_queue, submission_id
                 )
                 if sub is None:

 from agent.config import load_config
 from agent.core.agent_loop import submission_loop
+from agent.core import model_switcher
 from agent.core.session import OpType
 from agent.core.tools import ToolRouter
 from agent.utils.reliability_checks import check_training_script_save_pattern
 # on every error — users don't need it, and our friendly errors cover the case.
 litellm.suppress_debug_info = True
 def _safe_get_args(arguments: dict) -> dict:
     """Safely extract args dict from arguments, handling cases where LLM passes string."""
     args = arguments.get("args", {})
     return args if isinstance(args, dict) else {}
 def _get_hf_token() -> str | None:
     """Get HF token from environment, huggingface_hub API, or cached token file."""
     token = os.environ.get("HF_TOKEN")
 # Slash commands are defined in terminal_display
+async def _handle_slash_command(
     cmd: str,
     config,
     session_holder: list,
     """
     Handle a slash command. Returns a Submission to enqueue, or None if
     the command was handled locally (caller should set turn_complete_event).
+    Async because ``/model`` fires a probe ping to validate the model+effort
+    combo before committing the switch.
     """
     parts = cmd.strip().split(None, 1)
     command = parts[0].lower()
     if command == "/model":
         console = get_console()
         if not arg:
+            model_switcher.print_model_listing(config, console)
             return None
+        if not model_switcher.is_valid_model_id(arg):
+            model_switcher.print_invalid_id(arg, console)
             return None
         normalized = arg.removeprefix("huggingface/")
         session = session_holder[0] if session_holder else None
+        await model_switcher.probe_and_switch_model(
+            normalized, config, session, console, _get_hf_token(),
+        )
         return None
     if command == "/yolo":
     if command == "/effort":
         console = get_console()
+        valid = {"minimal", "low", "medium", "high", "xhigh", "max", "off"}
+        session = session_holder[0] if session_holder else None
         if not arg:
             current = config.reasoning_effort or "off"
+            console.print(f"[bold]Reasoning effort preference:[/bold] {current}")
+            if session and session.model_effective_effort:
+                console.print("[dim]Probed per model:[/dim]")
+                for m, eff in session.model_effective_effort.items():
+                    console.print(f"  [dim]{m}: {eff or 'off'}[/dim]")
             console.print(
+                "[dim]Set with '/effort minimal|low|medium|high|xhigh|max|off'. "
+                "'max' and 'xhigh' are Anthropic-only; the cascade falls back "
+                "to whatever the model actually accepts.[/dim]"
             )
             return None
         level = arg.lower()
             console.print(f"[dim]Expected one of: {', '.join(sorted(valid))}[/dim]")
             return None
         config.reasoning_effort = None if level == "off" else level
+        # Drop the per-model probe cache — the new preference may resolve
+        # differently. Next ``/model`` (or the retry safety net) reprobes.
+        if session is not None:
+            session.model_effective_effort.clear()
         console.print(f"[green]Reasoning effort: {level}[/green]")
+        if session is not None:
+            console.print(
+                "[dim]run /model <current> to re-probe, or send a message — "
+                "the agent adjusts automatically if the new level isn't supported.[/dim]"
+            )
         return None
     if command == "/status":
             # Handle slash commands
             if user_input.strip().startswith("/"):
+                sub = await _handle_slash_command(
                     user_input.strip(), config, session_holder, submission_queue, submission_id
                 )
                 if sub is None:

agent/tools/research_tool.py CHANGED Viewed

@@ -15,6 +15,7 @@ from litellm import Message, acompletion
 from agent.core.doom_loop import check_for_doom_loop
 from agent.core.llm_params import _resolve_llm_params
 from agent.core.session import Event
 logger = logging.getLogger(__name__)
@@ -246,10 +247,16 @@ async def research_handler(
     # Use a cheaper/faster model for research
     main_model = session.config.model_name
     research_model = _get_research_model(main_model)
     llm_params = _resolve_llm_params(
         research_model,
         getattr(session, "hf_token", None),
-        reasoning_effort=getattr(session.config, "reasoning_effort", None),
     )
     # Get read-only tool specs from the session's tool router
@@ -317,8 +324,9 @@ async def research_handler(
                 ),
             ))
             try:
                 response = await acompletion(
-                    messages=messages,
                     tools=None,  # no tools — force text response
                     stream=False,
                     timeout=120,
@@ -342,9 +350,12 @@ async def research_handler(
             ))
         try:
             response = await acompletion(
-                messages=messages,
-                tools=tool_specs if tool_specs else None,
                 tool_choice="auto",
                 stream=False,
                 timeout=120,
@@ -440,8 +451,9 @@ async def research_handler(
         ),
     ))
     try:
         response = await acompletion(
-            messages=messages,
             tools=None,
             stream=False,
             timeout=120,

 from agent.core.doom_loop import check_for_doom_loop
 from agent.core.llm_params import _resolve_llm_params
+from agent.core.prompt_caching import with_prompt_caching
 from agent.core.session import Event
 logger = logging.getLogger(__name__)
     # Use a cheaper/faster model for research
     main_model = session.config.model_name
     research_model = _get_research_model(main_model)
+    # Research is a cheap sub-call — cap the main session's effort at "high"
+    # so a user preference of ``max``/``xhigh`` (valid for Opus 4.6/4.7) doesn't
+    # propagate to a Sonnet research model that may not accept those levels.
+    # We also haven't probed this sub-model so we don't know its ceiling.
+    _pref = getattr(session.config, "reasoning_effort", None)
+    _capped = "high" if _pref in ("max", "xhigh") else _pref
     llm_params = _resolve_llm_params(
         research_model,
         getattr(session, "hf_token", None),
+        reasoning_effort=_capped,
     )
     # Get read-only tool specs from the session's tool router
                 ),
             ))
             try:
+                _msgs, _ = with_prompt_caching(messages, None, llm_params.get("model"))
                 response = await acompletion(
+                    messages=_msgs,
                     tools=None,  # no tools — force text response
                     stream=False,
                     timeout=120,
             ))
         try:
+            _msgs, _tools = with_prompt_caching(
+                messages, tool_specs if tool_specs else None, llm_params.get("model")
+            )
             response = await acompletion(
+                messages=_msgs,
+                tools=_tools,
                 tool_choice="auto",
                 stream=False,
                 timeout=120,
         ),
     ))
     try:
+        _msgs, _ = with_prompt_caching(messages, None, llm_params.get("model"))
         response = await acompletion(
+            messages=_msgs,
             tools=None,
             stream=False,
             timeout=120,

agent/utils/terminal_display.py CHANGED Viewed

@@ -440,7 +440,7 @@ HELP_TEXT = f"""\
 {_I}  [cyan]/undo[/cyan]            Undo last turn
 {_I}  [cyan]/compact[/cyan]         Compact context window
 {_I}  [cyan]/model[/cyan] [id]      Show available models or switch
-{_I}  [cyan]/effort[/cyan] [level]  Reasoning effort (minimal|low|medium|high|off)
 {_I}  [cyan]/yolo[/cyan]            Toggle auto-approve mode
 {_I}  [cyan]/status[/cyan]          Current model & turn count
 {_I}  [cyan]/quit[/cyan]            Exit"""

 {_I}  [cyan]/undo[/cyan]            Undo last turn
 {_I}  [cyan]/compact[/cyan]         Compact context window
 {_I}  [cyan]/model[/cyan] [id]      Show available models or switch
+{_I}  [cyan]/effort[/cyan] [level]  Reasoning effort (minimal|low|medium|high|xhigh|max|off)
 {_I}  [cyan]/yolo[/cyan]            Toggle auto-approve mode
 {_I}  [cyan]/status[/cyan]          Current model & turn count
 {_I}  [cyan]/quit[/cyan]            Exit"""

backend/dependencies.py CHANGED Viewed

@@ -16,6 +16,7 @@ logger = logging.getLogger(__name__)
 OPENID_PROVIDER_URL = os.environ.get("OPENID_PROVIDER_URL", "https://huggingface.co")
 AUTH_ENABLED = bool(os.environ.get("OAUTH_CLIENT_ID", ""))
 # Simple in-memory token cache: token -> (user_info, expiry_time)
 _token_cache: dict[str, tuple[dict[str, Any], float]] = {}
@@ -28,8 +29,13 @@ DEV_USER: dict[str, Any] = {
     "user_id": "dev",
     "username": "dev",
     "authenticated": True,
 }
 async def _validate_token(token: str) -> dict[str, Any] | None:
     """Validate a token against HF OAuth userinfo endpoint.
@@ -74,12 +80,86 @@ def _user_from_info(user_info: dict[str, Any]) -> dict[str, Any]:
     }
 async def _extract_user_from_token(token: str) -> dict[str, Any] | None:
     """Validate a token and return a user dict, or None."""
     user_info = await _validate_token(token)
-    if user_info:
-        return _user_from_info(user_info)
-    return None
 async def check_org_membership(token: str, org_name: str) -> bool:
@@ -141,3 +221,29 @@ async def get_current_user(request: Request) -> dict[str, Any]:
     )

 OPENID_PROVIDER_URL = os.environ.get("OPENID_PROVIDER_URL", "https://huggingface.co")
 AUTH_ENABLED = bool(os.environ.get("OAUTH_CLIENT_ID", ""))
+HF_EMPLOYEE_ORG = os.environ.get("HF_EMPLOYEE_ORG", "huggingface")
 # Simple in-memory token cache: token -> (user_info, expiry_time)
 _token_cache: dict[str, tuple[dict[str, Any], float]] = {}
     "user_id": "dev",
     "username": "dev",
     "authenticated": True,
+    "plan": "org",  # Dev runs at the Pro/Org quota tier so local testing isn't capped.
 }
+# Plan field discovery — log the whoami-v2 shape once at DEBUG so we can
+# confirm the actual key in production without hammering the HF API.
+_WHOAMI_SHAPE_LOGGED = False
 async def _validate_token(token: str) -> dict[str, Any] | None:
     """Validate a token against HF OAuth userinfo endpoint.
     }
+def _normalize_plan(whoami: dict[str, Any]) -> str:
+    """Map an HF /api/whoami-v2 payload to one of: 'free' | 'pro' | 'org'.
+    The exact field shape in whoami-v2 isn't documented for our purposes,
+    so we try a handful of likely keys and fall back to 'free'. The first
+    call logs the raw shape at DEBUG (see `_fetch_user_plan`) so we can
+    pin the real key post-deploy.
+    """
+    plan_str = ""
+    for key in ("plan", "type", "accountType"):
+        val = whoami.get(key)
+        if isinstance(val, str) and val:
+            plan_str = val.lower()
+            break
+    if not plan_str:
+        if whoami.get("isPro") is True or whoami.get("is_pro") is True:
+            return "pro"
+    if "pro" in plan_str or "enterprise" in plan_str or "team" in plan_str:
+        return "pro"
+    # Org tier: anyone in a paid / enterprise org. We don't pay for this
+    # right now, but the "pro" cap applies identically.
+    orgs = whoami.get("orgs") or []
+    if isinstance(orgs, list):
+        for org in orgs:
+            if isinstance(org, dict):
+                org_plan = str(org.get("plan") or org.get("type") or "").lower()
+                if "pro" in org_plan or "enterprise" in org_plan or "team" in org_plan:
+                    return "org"
+    return "free"
+async def _fetch_user_plan(token: str) -> str:
+    """Look up the user's HF plan via /api/whoami-v2.
+    Returns 'free' | 'pro' | 'org'. Non-200, network errors, or an unknown
+    payload shape all collapse to 'free' — safe default; we'd rather under-
+    grant the Pro cap than over-grant it on bad data.
+    """
+    global _WHOAMI_SHAPE_LOGGED
+    async with httpx.AsyncClient(timeout=5.0) as client:
+        try:
+            resp = await client.get(
+                f"{OPENID_PROVIDER_URL}/api/whoami-v2",
+                headers={"Authorization": f"Bearer {token}"},
+            )
+            if resp.status_code != 200:
+                return "free"
+            whoami = resp.json()
+        except httpx.HTTPError:
+            return "free"
+        except ValueError:
+            return "free"
+    if not _WHOAMI_SHAPE_LOGGED:
+        _WHOAMI_SHAPE_LOGGED = True
+        logger.debug(
+            "whoami-v2 payload keys: %s (sample values: plan=%r type=%r isPro=%r)",
+            sorted(whoami.keys()) if isinstance(whoami, dict) else type(whoami).__name__,
+            whoami.get("plan") if isinstance(whoami, dict) else None,
+            whoami.get("type") if isinstance(whoami, dict) else None,
+            whoami.get("isPro") if isinstance(whoami, dict) else None,
+        )
+    if not isinstance(whoami, dict):
+        return "free"
+    return _normalize_plan(whoami)
 async def _extract_user_from_token(token: str) -> dict[str, Any] | None:
     """Validate a token and return a user dict, or None."""
     user_info = await _validate_token(token)
+    if user_info is None:
+        return None
+    user = _user_from_info(user_info)
+    user["plan"] = await _fetch_user_plan(token)
+    return user
 async def check_org_membership(token: str, org_name: str) -> bool:
     )
+def _extract_token(request: Request) -> str | None:
+    """Pull the HF access token from the Authorization header or cookie.
+    Mirrors the lookup order used by ``get_current_user``.
+    """
+    auth_header = request.headers.get("Authorization", "")
+    if auth_header.startswith("Bearer "):
+        return auth_header[7:]
+    return request.cookies.get("hf_access_token")
+async def require_huggingface_org_member(request: Request) -> bool:
+    """Return True if the caller is a member of the ``huggingface`` org.
+    Used to gate endpoints that can push a session onto an Anthropic model
+    billed to the Space's ``ANTHROPIC_API_KEY``. Returns True unconditionally
+    in dev mode so local testing isn't blocked.
+    """
+    if not AUTH_ENABLED:
+        return True
+    token = _extract_token(request)
+    if not token:
+        return False
+    return await check_org_membership(token, HF_EMPLOYEE_ORG)

backend/routes/agent.py CHANGED Viewed

@@ -10,7 +10,7 @@ import logging
 import os
 from typing import Any
-from dependencies import get_current_user
 from fastapi import (
     APIRouter,
     Depends,
@@ -28,7 +28,9 @@ from models import (
     SubmitRequest,
     TruncateRequest,
 )
-from session_manager import MAX_SESSIONS, SessionCapacityError, session_manager
 from agent.core.llm_params import _resolve_llm_params
@@ -37,31 +39,99 @@ logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/api", tags=["agent"])
 AVAILABLE_MODELS = [
     {
         "id": "anthropic/claude-opus-4-6",
         "label": "Claude Opus 4.6",
         "provider": "anthropic",
         "recommended": True,
     },
     {
         "id": "MiniMaxAI/MiniMax-M2.7",
         "label": "MiniMax M2.7",
         "provider": "huggingface",
-        "recommended": True,
-    },
-    {
-        "id": "moonshotai/Kimi-K2.6",
-        "label": "Kimi K2.6",
-        "provider": "huggingface",
     },
     {
         "id": "zai-org/GLM-5.1",
         "label": "GLM 5.1",
         "provider": "huggingface",
     },
 ]
 def _check_session_access(session_id: str, user: dict[str, Any]) -> None:
     """Verify the user has access to the given session. Raises 403 or 404."""
     info = session_manager.get_session_info(session_id)
@@ -143,20 +213,6 @@ async def get_model() -> dict:
     }
-@router.post("/config/model")
-async def set_model(body: dict, user: dict = Depends(get_current_user)) -> dict:
-    """Set the LLM model. Applies to new conversations."""
-    model_id = body.get("model")
-    if not model_id:
-        raise HTTPException(status_code=400, detail="Missing 'model' field")
-    valid_ids = {m["id"] for m in AVAILABLE_MODELS}
-    if model_id not in valid_ids:
-        raise HTTPException(status_code=400, detail=f"Unknown model: {model_id}")
-    session_manager.config.model_name = model_id
-    logger.info(f"Model changed to {model_id} by {user.get('username', 'unknown')}")
-    return {"model": model_id}
 _TITLE_STRIP_CHARS = str.maketrans("", "", "`*_~#[]()")
@@ -224,6 +280,10 @@ async def create_session(
     and stored in the session so that tools (e.g. hf_jobs) can act on
     behalf of the user.
     Returns 503 if the server or user has reached the session limit.
     """
     # Extract the user's HF token (Bearer header, HttpOnly cookie, or env var)
@@ -236,9 +296,27 @@ async def create_session(
     if not hf_token:
         hf_token = os.environ.get("HF_TOKEN")
     try:
         session_id = await session_manager.create_session(
-            user_id=user["user_id"], hf_token=hf_token
         )
     except SessionCapacityError as e:
         raise HTTPException(status_code=503, detail=str(e))
@@ -254,6 +332,9 @@ async def restore_session_summary(
     conversation. The client sends its cached messages; we run the standard
     summarization prompt on them and drop the result into the new
     session's context as a user-role system note.
     """
     messages = body.get("messages")
     if not isinstance(messages, list) or not messages:
@@ -268,9 +349,17 @@ async def restore_session_summary(
     if not hf_token:
         hf_token = os.environ.get("HF_TOKEN")
     try:
         session_id = await session_manager.create_session(
-            user_id=user["user_id"], hf_token=hf_token
         )
     except SessionCapacityError as e:
         raise HTTPException(status_code=503, detail=str(e))
@@ -302,12 +391,19 @@ async def get_session(
 @router.post("/session/{session_id}/model")
 async def set_session_model(
-    session_id: str, body: dict, user: dict = Depends(get_current_user)
 ) -> dict:
     """Switch the active model for a single session (tab-scoped).
     Takes effect on the next LLM call in that session — other sessions
-    (including other browser tabs) are unaffected.
     """
     _check_session_access(session_id, user)
     model_id = body.get("model")
@@ -316,6 +412,7 @@ async def set_session_model(
     valid_ids = {m["id"] for m in AVAILABLE_MODELS}
     if model_id not in valid_ids:
         raise HTTPException(status_code=400, detail=f"Unknown model: {model_id}")
     agent_session = session_manager.sessions.get(session_id)
     if not agent_session:
         raise HTTPException(status_code=404, detail="Session not found")
@@ -327,6 +424,20 @@ async def set_session_model(
     return {"session_id": session_id, "model": model_id}
 @router.get("/sessions", response_model=list[SessionInfo])
 async def list_sessions(user: dict = Depends(get_current_user)) -> list[SessionInfo]:
     """List sessions belonging to the authenticated user."""
@@ -352,6 +463,9 @@ async def submit_input(
 ) -> dict:
     """Submit user input to a session. Only accessible by the session owner."""
     _check_session_access(request.session_id, user)
     success = await session_manager.submit_user_input(request.session_id, request.text)
     if not success:
         raise HTTPException(status_code=404, detail="Session not found or inactive")
@@ -404,6 +518,16 @@ async def chat_sse(
     text = body.get("text")
     approvals = body.get("approvals")
     try:
         if approvals:
             formatted = [

 import os
 from typing import Any
+from dependencies import get_current_user, require_huggingface_org_member
 from fastapi import (
     APIRouter,
     Depends,
     SubmitRequest,
     TruncateRequest,
 )
+from session_manager import MAX_SESSIONS, AgentSession, SessionCapacityError, session_manager
+import user_quotas
 from agent.core.llm_params import _resolve_llm_params
 router = APIRouter(prefix="/api", tags=["agent"])
+# Models a session may select. The set of "id" values is what the session
+# create / restore / switch handlers validate a requested model against, and
+# ids starting with "anthropic/" are the ones `_require_hf_for_anthropic`
+# restricts. "tier" and "recommended" are not read by any backend code
+# visible in this diff — presumably consumed by the frontend; TODO confirm.
 AVAILABLE_MODELS = [
+    {
+        "id": "moonshotai/Kimi-K2.6",
+        "label": "Kimi K2.6",
+        "provider": "huggingface",
+        "tier": "free",
+        "recommended": True,
+    },
     {
         "id": "anthropic/claude-opus-4-6",
         "label": "Claude Opus 4.6",
         "provider": "anthropic",
+        "tier": "pro",
         "recommended": True,
     },
     {
         "id": "MiniMaxAI/MiniMax-M2.7",
         "label": "MiniMax M2.7",
         "provider": "huggingface",
+        "tier": "free",
     },
     {
         "id": "zai-org/GLM-5.1",
         "label": "GLM 5.1",
         "provider": "huggingface",
+        "tier": "free",
     },
 ]
+async def _require_hf_for_anthropic(request: Request, model_id: str) -> None:
+    """403 if a non-``huggingface``-org user tries to select an Anthropic model.
+    Anthropic models are billed to the Space's ``ANTHROPIC_API_KEY``; every
+    other model in ``AVAILABLE_MODELS`` is routed through HF Router and
+    billed via ``X-HF-Bill-To``. The gate only fires for ``anthropic/*`` so
+    non-HF users can still freely switch between the free models.
+    Pattern: https://github.com/huggingface/ml-intern/pull/63
+    """
+    if not model_id.startswith("anthropic/"):
+        return
+    if not await require_huggingface_org_member(request):
+        raise HTTPException(
+            status_code=403,
+            detail={
+                "error": "anthropic_restricted",
+                "message": (
+                    "Opus is gated to HF staff. Pick a free model — "
+                    "Kimi K2.6, MiniMax M2.7, or GLM 5.1 — instead."
+                ),
+            },
+        )
+async def _enforce_claude_quota(
+    user: dict[str, Any],
+    agent_session: AgentSession,
+) -> None:
+    """Charge the user's daily Claude quota on first use of Anthropic in a session.
+    Runs at *message-submit* time, not session-create time — so spinning up a
+    Claude session to look around doesn't burn quota. The ``claude_counted``
+    flag on ``AgentSession`` guards against re-counting the same session.
+    No-ops when the session's current model isn't Anthropic, or when this
+    session has already been charged. Raises 429 when the user has hit
+    their daily cap.
+    """
+    if agent_session.claude_counted:
+        return
+    model_name = agent_session.session.config.model_name
+    if not model_name.startswith("anthropic/"):
+        return
+    user_id = user["user_id"]
+    used = await user_quotas.get_claude_used_today(user_id)
+    cap = user_quotas.daily_cap_for(user.get("plan"))
+    if used >= cap:
+        raise HTTPException(
+            status_code=429,
+            detail={
+                "error": "claude_daily_cap",
+                "plan": user.get("plan", "free"),
+                "cap": cap,
+                "message": (
+                    "Daily Claude limit reached. Upgrade to HF Pro for "
+                    f"{user_quotas.CLAUDE_PRO_DAILY}/day or use a free model."
+                ),
+            },
+        )
+    await user_quotas.increment_claude(user_id)
+    agent_session.claude_counted = True
 def _check_session_access(session_id: str, user: dict[str, Any]) -> None:
     """Verify the user has access to the given session. Raises 403 or 404."""
     info = session_manager.get_session_info(session_id)
     }
 _TITLE_STRIP_CHARS = str.maketrans("", "", "`*_~#[]()")
     and stored in the session so that tools (e.g. hf_jobs) can act on
     behalf of the user.
+    Optional body ``{"model"?: <id>}`` selects the session's LLM; unknown
+    ids are rejected (400). The Claude-quota gate runs at message-submit
+    time, not here — spinning up an Opus session to look around is free.
     Returns 503 if the server or user has reached the session limit.
     """
     # Extract the user's HF token (Bearer header, HttpOnly cookie, or env var)
     if not hf_token:
         hf_token = os.environ.get("HF_TOKEN")
+    # Optional model override. Empty body falls back to the config default.
+    model: str | None = None
+    try:
+        body = await request.json()
+    except Exception:
+        body = None
+    if isinstance(body, dict):
+        model = body.get("model")
+    valid_ids = {m["id"] for m in AVAILABLE_MODELS}
+    if model and model not in valid_ids:
+        raise HTTPException(status_code=400, detail=f"Unknown model: {model}")
+    # Opus is gated to HF staff (PR #63). Only fires when the resolved model
+    # is Anthropic; free models pass through.
+    resolved_model = model or session_manager.config.model_name
+    await _require_hf_for_anthropic(request, resolved_model)
     try:
         session_id = await session_manager.create_session(
+            user_id=user["user_id"], hf_token=hf_token, model=model
         )
     except SessionCapacityError as e:
         raise HTTPException(status_code=503, detail=str(e))
     conversation. The client sends its cached messages; we run the standard
     summarization prompt on them and drop the result into the new
     session's context as a user-role system note.
+    Optional ``"model"`` in the body overrides the session's LLM. The
+    Claude-quota gate runs at message-submit time, not here.
     """
     messages = body.get("messages")
     if not isinstance(messages, list) or not messages:
     if not hf_token:
         hf_token = os.environ.get("HF_TOKEN")
+    model = body.get("model")
+    valid_ids = {m["id"] for m in AVAILABLE_MODELS}
+    if model and model not in valid_ids:
+        raise HTTPException(status_code=400, detail=f"Unknown model: {model}")
+    resolved_model = model or session_manager.config.model_name
+    await _require_hf_for_anthropic(request, resolved_model)
     try:
         session_id = await session_manager.create_session(
+            user_id=user["user_id"], hf_token=hf_token, model=model
         )
     except SessionCapacityError as e:
         raise HTTPException(status_code=503, detail=str(e))
 @router.post("/session/{session_id}/model")
 async def set_session_model(
+    session_id: str,
+    body: dict,
+    request: Request,
+    user: dict = Depends(get_current_user),
 ) -> dict:
     """Switch the active model for a single session (tab-scoped).
     Takes effect on the next LLM call in that session — other sessions
+    (including other browser tabs) are unaffected. Model switches don't
+    charge quota — the Claude-quota gate only fires at message-submit time.
+    Switching TO an Anthropic model requires HF org membership (PR #63);
+    free-model switches are unrestricted.
     """
     _check_session_access(session_id, user)
     model_id = body.get("model")
     valid_ids = {m["id"] for m in AVAILABLE_MODELS}
     if model_id not in valid_ids:
         raise HTTPException(status_code=400, detail=f"Unknown model: {model_id}")
+    await _require_hf_for_anthropic(request, model_id)
     agent_session = session_manager.sessions.get(session_id)
     if not agent_session:
         raise HTTPException(status_code=404, detail="Session not found")
     return {"session_id": session_id, "model": model_id}
+@router.get("/user/quota")
+async def get_user_quota(user: dict = Depends(get_current_user)) -> dict:
+    """Return the user's plan tier and today's Claude-session quota state."""
+    plan = user.get("plan", "free")
+    used = await user_quotas.get_claude_used_today(user["user_id"])
+    cap = user_quotas.daily_cap_for(plan)
+    return {
+        "plan": plan,
+        "claude_used_today": used,
+        "claude_daily_cap": cap,
+        "claude_remaining": max(0, cap - used),
+    }
 @router.get("/sessions", response_model=list[SessionInfo])
 async def list_sessions(user: dict = Depends(get_current_user)) -> list[SessionInfo]:
     """List sessions belonging to the authenticated user."""
 ) -> dict:
     """Submit user input to a session. Only accessible by the session owner."""
     _check_session_access(request.session_id, user)
+    agent_session = session_manager.sessions.get(request.session_id)
+    if agent_session is not None:
+        await _enforce_claude_quota(user, agent_session)
     success = await session_manager.submit_user_input(request.session_id, request.text)
     if not success:
         raise HTTPException(status_code=404, detail="Session not found or inactive")
     text = body.get("text")
     approvals = body.get("approvals")
+    # Gate user-message sends against the daily Claude quota. Approvals are
+    # continuations of an in-progress turn — the session was already charged
+    # on its first message, so we skip the gate there.
+    if text is not None and not approvals:
+        try:
+            await _enforce_claude_quota(user, agent_session)
+        except HTTPException:
+            broadcaster.unsubscribe(sub_id)
+            raise
     try:
         if approvals:
             formatted = [

backend/session_manager.py CHANGED Viewed

@@ -91,6 +91,10 @@ class AgentSession:
     is_active: bool = True
     is_processing: bool = False  # True while a submission is being executed
     broadcaster: Any = None
 class SessionCapacityError(Exception):
@@ -126,7 +130,12 @@ class SessionManager:
             if s.user_id == user_id and s.is_active
         )
-    async def create_session(self, user_id: str = "dev", hf_token: str | None = None) -> str:
         """Create a new agent session and return its ID.
         Session() and ToolRouter() constructors contain blocking I/O
@@ -135,6 +144,10 @@ class SessionManager:
         Args:
             user_id: The ID of the user who owns this session.
         Raises:
             SessionCapacityError: If the server or user has reached the
@@ -175,6 +188,8 @@ class SessionManager:
             # Deep-copy config so each session's model switches independently —
             # tab A picking GLM doesn't flip tab B off Claude.
             session_config = self.config.model_copy(deep=True)
             session = Session(
                 event_queue, config=session_config, tool_router=tool_router,
                 hf_token=hf_token,

     is_active: bool = True
     is_processing: bool = False  # True while a submission is being executed
     broadcaster: Any = None
+    # True once this session has been counted against the user's daily
+    # Claude quota. Guards double-counting when the user re-selects an
+    # Anthropic model mid-session.
+    claude_counted: bool = False
 class SessionCapacityError(Exception):
             if s.user_id == user_id and s.is_active
         )
+    async def create_session(
+        self,
+        user_id: str = "dev",
+        hf_token: str | None = None,
+        model: str | None = None,
+    ) -> str:
         """Create a new agent session and return its ID.
         Session() and ToolRouter() constructors contain blocking I/O
         Args:
             user_id: The ID of the user who owns this session.
+            hf_token: The user's HF OAuth token, stored for tool execution.
+            model: Optional model override. When set, replaces ``model_name``
+                on the per-session config clone. None falls back to the
+                config default.
         Raises:
             SessionCapacityError: If the server or user has reached the
             # Deep-copy config so each session's model switches independently —
             # tab A picking GLM doesn't flip tab B off Claude.
             session_config = self.config.model_copy(deep=True)
+            if model:
+                session_config.model_name = model
             session = Session(
                 event_queue, config=session_config, tool_router=tool_router,
                 hf_token=hf_token,

backend/user_quotas.py ADDED Viewed

	@@ -0,0 +1,83 @@

+"""In-memory daily quota for Claude session creations.
+Tracks per-user Claude session starts against a daily cap derived from the
+user's HF plan. Caps reset at UTC midnight; the store itself is in-process
+and wipes on restart (deliberate — the cost of occasional over-subsidy at
+restart is much lower than running a DB).
+Unit: *sessions*, not messages. The first message a user sends in a session
+whose model is Claude consumes one quota point (the gate runs at message-submit
+time, not at session creation); later messages in that session, and switching
+it back to Claude again, don't (`AgentSession.claude_counted` guards that).
+Cap tiers:
+  free user   → CLAUDE_FREE_DAILY (1)
+  pro / org   → CLAUDE_PRO_DAILY  (20)
+"""
+import asyncio
+import os
+from datetime import UTC, datetime
+# Daily caps per plan tier. Each can be overridden through an environment
+# variable of the same name; the value is parsed with int() at import time,
+# so a non-numeric override raises ValueError when this module is imported.
+CLAUDE_FREE_DAILY: int = int(os.environ.get("CLAUDE_FREE_DAILY", "1"))
+CLAUDE_PRO_DAILY: int = int(os.environ.get("CLAUDE_PRO_DAILY", "20"))
+# user_id -> (day_utc_iso, count_for_that_day)
+_claude_counts: dict[str, tuple[str, int]] = {}
+# Held by the async accessors in this module around every read/update of
+# `_claude_counts`.
+_lock = asyncio.Lock()
+def _today() -> str:
+    return datetime.now(UTC).date().isoformat()
+def daily_cap_for(plan: str | None) -> int:
+    """Return the daily Claude-session cap for the given plan."""
+    return CLAUDE_FREE_DAILY if (plan or "free") == "free" else CLAUDE_PRO_DAILY
+async def get_claude_used_today(user_id: str) -> int:
+    """Return today's Claude session count for the user (0 if none / stale day)."""
+    async with _lock:
+        entry = _claude_counts.get(user_id)
+        if entry is None:
+            return 0
+        day, count = entry
+        if day != _today():
+            # Stale day — drop the entry so the first increment starts fresh.
+            _claude_counts.pop(user_id, None)
+            return 0
+        return count
+async def increment_claude(user_id: str) -> int:
+    """Bump today's Claude session count for the user. Returns the new value."""
+    async with _lock:
+        today = _today()
+        day, count = _claude_counts.get(user_id, (today, 0))
+        if day != today:
+            count = 0
+        count += 1
+        _claude_counts[user_id] = (today, count)
+        return count
+async def refund_claude(user_id: str) -> None:
+    """Decrement today's count — used when session creation fails after a successful gate."""
+    async with _lock:
+        entry = _claude_counts.get(user_id)
+        if entry is None:
+            return
+        day, count = entry
+        if day != _today():
+            _claude_counts.pop(user_id, None)
+            return
+        new_count = max(0, count - 1)
+        if new_count == 0:
+            _claude_counts.pop(user_id, None)
+        else:
+            _claude_counts[user_id] = (day, new_count)
+def _reset_for_tests() -> None:
+    """Test-only: clear the in-memory store."""
+    _claude_counts.clear()

frontend/src/components/Chat/ChatInput.tsx CHANGED Viewed

@@ -4,6 +4,10 @@ import ArrowUpwardIcon from '@mui/icons-material/ArrowUpward';
 import ArrowDropDownIcon from '@mui/icons-material/ArrowDropDown';
 import StopIcon from '@mui/icons-material/Stop';
 import { apiFetch } from '@/utils/api';
 // Model configuration
 interface ModelOption {
@@ -21,6 +25,14 @@ const getHfAvatarUrl = (modelId: string) => {
 };
 const MODEL_OPTIONS: ModelOption[] = [
   {
     id: 'claude-opus',
     name: 'Claude Opus 4.6',
@@ -35,14 +47,6 @@ const MODEL_OPTIONS: ModelOption[] = [
     description: 'Novita',
     modelPath: 'MiniMaxAI/MiniMax-M2.7',
     avatarUrl: getHfAvatarUrl('MiniMaxAI/MiniMax-M2.7'),
-    recommended: true,
-  },
-  {
-    id: 'kimi-k2.6',
-    name: 'Kimi K2.6',
-    description: 'Novita',
-    modelPath: 'moonshotai/Kimi-K2.6',
-    avatarUrl: getHfAvatarUrl('moonshotai/Kimi-K2.6'),
   },
   {
     id: 'glm-5.1',
@@ -66,11 +70,23 @@ interface ChatInputProps {
   placeholder?: string;
 }
 export default function ChatInput({ sessionId, onSend, onStop, isProcessing = false, disabled = false, placeholder = 'Ask anything...' }: ChatInputProps) {
   const [input, setInput] = useState('');
   const inputRef = useRef<HTMLTextAreaElement>(null);
   const [selectedModelId, setSelectedModelId] = useState<string>(MODEL_OPTIONS[0].id);
   const [modelAnchorEl, setModelAnchorEl] = useState<null | HTMLElement>(null);
   // Model is per-session: fetch this tab's current model every time the
   // session changes. Other tabs keep their own selections independently.
@@ -101,11 +117,27 @@ export default function ChatInput({ sessionId, onSend, onStop, isProcessing = fa
   const handleSend = useCallback(() => {
     if (input.trim() && !disabled) {
       onSend(input);
       setInput('');
     }
   }, [input, disabled, onSend]);
   const handleKeyDown = useCallback(
     (e: KeyboardEvent<HTMLDivElement>) => {
       if (e.key === 'Enter' && !e.shiftKey) {
@@ -136,6 +168,45 @@ export default function ChatInput({ sessionId, onSend, onStop, isProcessing = fa
     } catch { /* ignore */ }
   };
   return (
     <Box
       sx={{
@@ -334,6 +405,19 @@ export default function ChatInput({ sessionId, onSend, onStop, isProcessing = fa
                         }}
                       />
                     )}
                   </Box>
                 }
                 secondary={model.description}
@@ -344,6 +428,14 @@ export default function ChatInput({ sessionId, onSend, onStop, isProcessing = fa
             </MenuItem>
           ))}
         </Menu>
       </Box>
     </Box>
   );

 import ArrowDropDownIcon from '@mui/icons-material/ArrowDropDown';
 import StopIcon from '@mui/icons-material/Stop';
 import { apiFetch } from '@/utils/api';
+import { useUserQuota } from '@/hooks/useUserQuota';
+import ClaudeCapDialog from '@/components/ClaudeCapDialog';
+import { useAgentStore } from '@/store/agentStore';
+import { FIRST_FREE_MODEL_PATH } from '@/utils/model';
 // Model configuration
 interface ModelOption {
 };
 const MODEL_OPTIONS: ModelOption[] = [
+  {
+    id: 'kimi-k2.6',
+    name: 'Kimi K2.6',
+    description: 'Novita',
+    modelPath: 'moonshotai/Kimi-K2.6',
+    avatarUrl: getHfAvatarUrl('moonshotai/Kimi-K2.6'),
+    recommended: true,
+  },
   {
     id: 'claude-opus',
     name: 'Claude Opus 4.6',
     description: 'Novita',
     modelPath: 'MiniMaxAI/MiniMax-M2.7',
     avatarUrl: getHfAvatarUrl('MiniMaxAI/MiniMax-M2.7'),
   },
   {
     id: 'glm-5.1',
   placeholder?: string;
 }
+const isClaudeModel = (m: ModelOption) => m.modelPath.startsWith('anthropic/');
+const firstFreeModel = () => MODEL_OPTIONS.find(m => !isClaudeModel(m)) ?? MODEL_OPTIONS[0];
 export default function ChatInput({ sessionId, onSend, onStop, isProcessing = false, disabled = false, placeholder = 'Ask anything...' }: ChatInputProps) {
   const [input, setInput] = useState('');
   const inputRef = useRef<HTMLTextAreaElement>(null);
   const [selectedModelId, setSelectedModelId] = useState<string>(MODEL_OPTIONS[0].id);
   const [modelAnchorEl, setModelAnchorEl] = useState<null | HTMLElement>(null);
+  const { quota, refresh: refreshQuota } = useUserQuota();
+  // The daily-cap dialog currently has a single trigger: a 429 returned
+  // from the chat transport when the user tries to send on Opus over cap,
+  // surfaced via the agent-store flag. Nothing else opens it right now
+  // (switching models is free). Keeping the open state in the store means
+  // the hook layer can flip it without threading props through.
+  const claudeQuotaExhausted = useAgentStore((s) => s.claudeQuotaExhausted);
+  const setClaudeQuotaExhausted = useAgentStore((s) => s.setClaudeQuotaExhausted);
+  const lastSentRef = useRef<string>('');
   // Model is per-session: fetch this tab's current model every time the
   // session changes. Other tabs keep their own selections independently.
   const handleSend = useCallback(() => {
     if (input.trim() && !disabled) {
+      lastSentRef.current = input;
       onSend(input);
       setInput('');
     }
   }, [input, disabled, onSend]);
+  // When the chat transport reports a Claude-quota 429, restore the typed
+  // text so the user doesn't lose their message.
+  useEffect(() => {
+    if (claudeQuotaExhausted && lastSentRef.current) {
+      setInput(lastSentRef.current);
+    }
+  }, [claudeQuotaExhausted]);
+  // Refresh the quota display whenever the session changes (user might
+  // have started another tab that spent quota).
+  useEffect(() => {
+    if (sessionId) refreshQuota();
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, [sessionId]);
   const handleKeyDown = useCallback(
     (e: KeyboardEvent<HTMLDivElement>) => {
       if (e.key === 'Enter' && !e.shiftKey) {
     } catch { /* ignore */ }
   };
+  // Dialog close: just clear the flag. The typed text is already restored.
+  const handleCapDialogClose = useCallback(() => {
+    setClaudeQuotaExhausted(false);
+  }, [setClaudeQuotaExhausted]);
+  // "Use a free model" — switch the current session to Kimi (or the first
+  // non-Anthropic option) and auto-retry the send that tripped the cap.
+  const handleUseFreeModel = useCallback(async () => {
+    setClaudeQuotaExhausted(false);
+    if (!sessionId) return;
+    const free = MODEL_OPTIONS.find(m => m.modelPath === FIRST_FREE_MODEL_PATH)
+      ?? firstFreeModel();
+    try {
+      const res = await apiFetch(`/api/session/${sessionId}/model`, {
+        method: 'POST',
+        body: JSON.stringify({ model: free.modelPath }),
+      });
+      if (res.ok) {
+        setSelectedModelId(free.id);
+        const retryText = lastSentRef.current;
+        if (retryText) {
+          onSend(retryText);
+          setInput('');
+          lastSentRef.current = '';
+        }
+      }
+    } catch { /* ignore */ }
+  }, [sessionId, onSend, setClaudeQuotaExhausted]);
+  // Hide the chip until the user has actually burned quota — an unused
+  // Opus session shouldn't populate a counter.
+  const claudeChip = (() => {
+    if (!quota || quota.claudeUsedToday === 0) return null;
+    if (quota.plan === 'free') {
+      return quota.claudeRemaining > 0 ? 'Free today' : 'Pro only';
+    }
+    return `${quota.claudeUsedToday}/${quota.claudeDailyCap} today`;
+  })();
   return (
     <Box
       sx={{
                         }}
                       />
                     )}
+                    {isClaudeModel(model) && claudeChip && (
+                      <Chip
+                        label={claudeChip}
+                        size="small"
+                        sx={{
+                          height: '18px',
+                          fontSize: '10px',
+                          bgcolor: 'rgba(255,255,255,0.08)',
+                          color: 'var(--muted-text)',
+                          fontWeight: 600,
+                        }}
+                      />
+                    )}
                   </Box>
                 }
                 secondary={model.description}
             </MenuItem>
           ))}
         </Menu>
+        <ClaudeCapDialog
+          open={claudeQuotaExhausted}
+          plan={quota?.plan ?? 'free'}
+          cap={quota?.claudeDailyCap ?? 1}
+          onClose={handleCapDialogClose}
+          onUseFreeModel={handleUseFreeModel}
+        />
       </Box>
     </Box>
   );

frontend/src/components/ClaudeCapDialog.tsx ADDED Viewed

	@@ -0,0 +1,134 @@

+import {
+  Box,
+  Button,
+  Dialog,
+  DialogActions,
+  DialogContent,
+  DialogContentText,
+  DialogTitle,
+  Typography,
+} from '@mui/material';
+import type { PlanTier } from '@/hooks/useUserQuota';
+// Destination of the "Upgrade to Pro" button.
+const HF_PRICING_URL = 'https://huggingface.co/pricing';
+// Daily Opus session count quoted in the Pro upsell copy. Hard-coded here;
+// presumably mirrors the backend's Pro-tier cap — keep the two in sync.
+const PRO_CAP = 20;
+/** Props accepted by the Opus daily-cap dialog. */
+interface ClaudeCapDialogProps {
+  /** Whether the dialog is shown; forwarded to the MUI `Dialog`. */
+  open: boolean;
+  /** Caller's plan tier. Accepted but not currently used in the dialog copy. */
+  plan: PlanTier;
+  /** Daily session cap interpolated into the body text. */
+  cap: number;
+  /** Forwarded to the `Dialog`'s `onClose`. */
+  onClose: () => void;
+  /** Click handler for the "Use a free model" button. */
+  onUseFreeModel: () => void;
+}
+/**
+ * Modal shown when the user has hit their daily Opus cap.
+ *
+ * Renders a title, a short explanation that interpolates `cap` (with
+ * "session"/"sessions" pluralised on `cap === 1`), a highlighted HF Pro
+ * upsell box quoting `PRO_CAP`, and two actions:
+ *  - "Upgrade to Pro": an anchor that opens `HF_PRICING_URL` in a new tab.
+ *    It has no click handler of its own, so it does not call `onClose`.
+ *  - "Use a free model": calls `onUseFreeModel`.
+ *
+ * `onClose` is wired to the `Dialog` itself, not to either button.
+ */
+export default function ClaudeCapDialog({
+  open,
+  plan,
+  cap,
+  onClose,
+  onUseFreeModel,
+}: ClaudeCapDialogProps) {
+  // `plan` is not surfaced in the copy right now — Pro users see the same
+  // dialog and can upgrade their org if they're also capped. `void plan`
+  // references the prop without using its value.
+  void plan;
+  return (
+    <Dialog
+      open={open}
+      onClose={onClose}
+      slotProps={{
+        backdrop: { sx: { backgroundColor: 'rgba(0,0,0,0.5)', backdropFilter: 'blur(4px)' } },
+      }}
+      PaperProps={{
+        sx: {
+          bgcolor: 'var(--panel)',
+          border: '1px solid var(--border)',
+          borderRadius: 'var(--radius-md)',
+          boxShadow: 'var(--shadow-1)',
+          maxWidth: 460,
+          mx: 2,
+        },
+      }}
+    >
+      <DialogTitle
+        sx={{ color: 'var(--text)', fontWeight: 700, fontSize: '1rem', pt: 2.5, pb: 0, px: 3 }}
+      >
+        You've hit your Opus limit
+      </DialogTitle>
+      <DialogContent sx={{ px: 3, pt: 1.25, pb: 0 }}>
+        <DialogContentText
+          sx={{ color: 'var(--muted-text)', fontSize: '0.85rem', lineHeight: 1.6 }}
+        >
+          Opus costs an arm and a leg, so we unfortunately have to cap you at {cap}{' '}
+          {cap === 1 ? 'session' : 'sessions'} a day. Give Kimi, MiniMax, or GLM a spin —
+          they are genuinely good and we use them all the time.
+        </DialogContentText>
+        <Box
+          sx={{
+            mt: 2,
+            p: 1.5,
+            borderRadius: '8px',
+            bgcolor: 'var(--accent-yellow-weak)',
+            border: '1px solid var(--border)',
+          }}
+        >
+          <Typography
+            variant="caption"
+            sx={{
+              display: 'block',
+              fontWeight: 700,
+              color: 'var(--text)',
+              fontSize: '0.78rem',
+              mb: 0.5,
+              letterSpacing: '0.02em',
+            }}
+          >
+            HF Pro ($9/mo) — more Opus, more everything
+          </Typography>
+          <Typography
+            variant="caption"
+            sx={{ display: 'block', color: 'var(--muted-text)', fontSize: '0.78rem', lineHeight: 1.55 }}
+          >
+            {PRO_CAP} Opus sessions/day here, 20× HF Inference credits, ZeroGPU access,
+            and priority on Spaces hardware.
+          </Typography>
+        </Box>
+      </DialogContent>
+      <DialogActions sx={{ px: 3, pb: 2.5, pt: 2, gap: 1 }}>
+        <Button
+          component="a"
+          href={HF_PRICING_URL}
+          target="_blank"
+          rel="noopener noreferrer"
+          variant="contained"
+          size="small"
+          sx={{
+            fontSize: '0.82rem',
+            px: 2.5,
+            bgcolor: 'var(--accent-yellow)',
+            color: '#000',
+            textTransform: 'none',
+            fontWeight: 700,
+            boxShadow: 'none',
+            '&:hover': { bgcolor: '#FFB340', boxShadow: 'none' },
+          }}
+        >
+          Upgrade to Pro
+        </Button>
+        <Button
+          onClick={onUseFreeModel}
+          size="small"
+          sx={{
+            color: 'var(--muted-text)',
+            fontSize: '0.82rem',
+            px: 2,
+            textTransform: 'none',
+            '&:hover': { bgcolor: 'var(--hover-bg)' },
+          }}
+        >
+          Use a free model
+        </Button>
+      </DialogActions>
+    </Dialog>
+  );
+}

frontend/src/hooks/useAgentChat.ts CHANGED Viewed

@@ -345,8 +345,16 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
     // sendMessages on the transport.
     sendAutomaticallyWhen: lastAssistantMessageIsCompleteWithApprovalResponses,
     onError: (error) => {
-      logger.error('useChat error:', error);
       updateSession(sessionId, { isProcessing: false });
       if (isActiveRef.current) {
         useAgentStore.getState().setError(error.message);
       }

     // sendMessages on the transport.
     sendAutomaticallyWhen: lastAssistantMessageIsCompleteWithApprovalResponses,
     onError: (error) => {
       updateSession(sessionId, { isProcessing: false });
+      // Claude daily-cap: open the cap dialog instead of the generic error
+      // banner. Transport marks the error with this sentinel.
+      if (error.message === 'CLAUDE_QUOTA_EXHAUSTED') {
+        if (isActiveRef.current) {
+          useAgentStore.getState().setClaudeQuotaExhausted(true);
+        }
+        return;
+      }
+      logger.error('useChat error:', error);
       if (isActiveRef.current) {
         useAgentStore.getState().setError(error.message);
       }

frontend/src/hooks/useUserQuota.ts ADDED Viewed

	@@ -0,0 +1,51 @@

+/**
+ * Reads the current user's Claude daily quota + plan tier from the backend.
+ *
+ * Fetches once when the user becomes authenticated, and exposes a `refresh()`
+ * that callers invoke after a successful session-create / model-switch so the
+ * chip reflects the new count without a full page reload.
+ */
+import { useCallback, useEffect, useState } from 'react';
+import { useAgentStore } from '@/store/agentStore';
+import { apiFetch } from '@/utils/api';
+export type PlanTier = 'free' | 'pro' | 'org';
+/** Client-side (camelCased) view of the `/api/user/quota` response. */
+export interface UserQuota {
+  /** Plan tier; `useUserQuota` falls back to 'free' when the response omits it. */
+  plan: PlanTier;
+  /** Read from `claude_used_today`; falls back to 0 when absent. */
+  claudeUsedToday: number;
+  /** Read from `claude_daily_cap`; falls back to 1 when absent. */
+  claudeDailyCap: number;
+  /** Read from `claude_remaining`; falls back to 0 when absent. */
+  claudeRemaining: number;
+}
+/**
+ * Hook exposing the authenticated user's Claude quota.
+ *
+ * Returns `{ quota, loading, refresh }`:
+ *  - `quota` is `null` until a fetch succeeds, and is reset to `null`
+ *    whenever the user is not authenticated.
+ *  - `loading` is true while a request to `/api/user/quota` is in flight.
+ *  - `refresh()` re-fetches on demand; it also runs automatically whenever
+ *    the user's `authenticated` flag changes.
+ *
+ * A non-OK response or a network failure leaves the previous value in place.
+ */
+export function useUserQuota() {
+  const user = useAgentStore((s) => s.user);
+  const [quota, setQuota] = useState<UserQuota | null>(null);
+  const [loading, setLoading] = useState(false);
+  const refresh = useCallback(async () => {
+    if (!user?.authenticated) {
+      // Drop anything fetched for a previous login so a signed-out (or
+      // switched) user never keeps showing someone else's counts.
+      setQuota(null);
+      return;
+    }
+    setLoading(true);
+    try {
+      const res = await apiFetch('/api/user/quota');
+      if (!res.ok) return;
+      const data = await res.json();
+      // Backend fields are snake_case; missing fields get conservative defaults.
+      setQuota({
+        plan: (data.plan ?? 'free') as PlanTier,
+        claudeUsedToday: data.claude_used_today ?? 0,
+        claudeDailyCap: data.claude_daily_cap ?? 1,
+        claudeRemaining: data.claude_remaining ?? 0,
+      });
+    } catch {
+      /* backend unreachable — leave previous value */
+    } finally {
+      // Runs on every exit path above, including the early `!res.ok` return.
+      setLoading(false);
+    }
+  }, [user?.authenticated]);
+  useEffect(() => {
+    refresh();
+  }, [refresh]);
+  return { quota, loading, refresh };
+}

frontend/src/lib/sse-chat-transport.ts CHANGED Viewed

@@ -356,6 +356,12 @@ export class SSEChatTransport implements ChatTransport<UIMessage> {
       // it can flag the session for the catch-up banner.
       this.sideChannel.onSessionDead(sessionId);
     }
     if (!response.ok) {
       const errorText = await response.text().catch(() => 'Request failed');
       throw new Error(`Chat request failed: ${response.status} ${errorText}`);

       // it can flag the session for the catch-up banner.
       this.sideChannel.onSessionDead(sessionId);
     }
+    if (response.status === 429) {
+      // Claude daily-quota gate tripped. The message is the exact sentinel
+      // that useAgentChat's onError handler matches on to surface the cap
+      // dialog instead of a generic error banner.
+      throw new Error('CLAUDE_QUOTA_EXHAUSTED');
+    }
     if (!response.ok) {
       const errorText = await response.text().catch(() => 'Request failed');
       throw new Error(`Chat request failed: ${response.status} ${errorText}`);

frontend/src/store/agentStore.ts CHANGED Viewed

@@ -108,6 +108,8 @@ interface AgentStore {
   user: User | null;
   error: string | null;
   llmHealthError: LLMHealthError | null;
   // Right panel (single-artifact pattern)
   panelData: PanelData | null;
@@ -153,6 +155,7 @@ interface AgentStore {
   setUser: (user: User | null) => void;
   setError: (error: string | null) => void;
   setLlmHealthError: (error: LLMHealthError | null) => void;
   setPanel: (data: PanelData, view?: PanelView, editable?: boolean) => void;
   setPanelView: (view: PanelView) => void;
@@ -247,6 +250,7 @@ export const useAgentStore = create<AgentStore>()((set, get) => ({
   user: null,
   error: null,
   llmHealthError: null,
   panelData: null,
   panelView: 'script',
@@ -358,6 +362,7 @@ export const useAgentStore = create<AgentStore>()((set, get) => ({
   setUser: (user) => set({ user }),
   setError: (error) => set({ error }),
   setLlmHealthError: (error) => set({ llmHealthError: error }),
   // ── Panel (single-artifact) ───────────────────────────────────────
   // Each setter also patches the active session's snapshot so that

   user: User | null;
   error: string | null;
   llmHealthError: LLMHealthError | null;
+  /** Set when a Claude-send hits the daily quota — ChatInput opens the cap dialog in response. */
+  claudeQuotaExhausted: boolean;
   // Right panel (single-artifact pattern)
   panelData: PanelData | null;
   setUser: (user: User | null) => void;
   setError: (error: string | null) => void;
   setLlmHealthError: (error: LLMHealthError | null) => void;
+  setClaudeQuotaExhausted: (exhausted: boolean) => void;
   setPanel: (data: PanelData, view?: PanelView, editable?: boolean) => void;
   setPanelView: (view: PanelView) => void;
   user: null,
   error: null,
   llmHealthError: null,
+  claudeQuotaExhausted: false,
   panelData: null,
   panelView: 'script',
   setUser: (user) => set({ user }),
   setError: (error) => set({ error }),
   setLlmHealthError: (error) => set({ llmHealthError: error }),
+  setClaudeQuotaExhausted: (exhausted) => set({ claudeQuotaExhausted: exhausted }),
   // ── Panel (single-artifact) ───────────────────────────────────────
   // Each setter also patches the active session's snapshot so that

frontend/src/utils/model.ts ADDED Viewed

	@@ -0,0 +1,15 @@

+/**
+ * Shared model-id constants used by session-create call sites and the
+ * ClaudeCapDialog "Use a free model" escape hatch.
+ *
+ * Keep in sync with MODEL_OPTIONS in components/Chat/ChatInput.tsx and
+ * AVAILABLE_MODELS in backend/routes/agent.py. Bare HF ids (no
+ * `huggingface/` prefix) — matches upstream's auto-router.
+ */
+export const CLAUDE_MODEL_PATH = 'anthropic/claude-opus-4-6';
+export const FIRST_FREE_MODEL_PATH = 'moonshotai/Kimi-K2.6';
+/** True when `modelPath` is set and begins with the `anthropic/` provider prefix. */
+export function isClaudePath(modelPath: string | undefined): boolean {
+  if (!modelPath) {
+    return false;
+  }
+  return modelPath.startsWith('anthropic/');
+}

tests/unit/test_user_quotas.py ADDED Viewed

	@@ -0,0 +1,116 @@

+"""Tests for backend/user_quotas.py — the in-memory Claude daily-quota store."""
+import asyncio
+import os
+import sys
+from pathlib import Path
+from unittest.mock import patch
+import pytest
+# The backend package isn't on sys.path by default; add it so we can import
+# the module under test without pulling in the whole FastAPI app.
+_BACKEND_DIR = Path(__file__).resolve().parent.parent.parent / "backend"
+if str(_BACKEND_DIR) not in sys.path:
+    sys.path.insert(0, str(_BACKEND_DIR))
+import user_quotas  # noqa: E402
+@pytest.fixture(autouse=True)
+def _reset_store():
+    """Give every test a fresh in-memory store.
+
+    Autouse, so it wraps each test in this module: the store is reset before
+    the test body runs and once more after it finishes.
+    """
+    user_quotas._reset_for_tests()
+    yield
+    user_quotas._reset_for_tests()
+def test_daily_cap_for_known_plans():
+    assert user_quotas.daily_cap_for("free") == user_quotas.CLAUDE_FREE_DAILY
+    assert user_quotas.daily_cap_for("pro") == user_quotas.CLAUDE_PRO_DAILY
+    assert user_quotas.daily_cap_for("org") == user_quotas.CLAUDE_PRO_DAILY
+def test_daily_cap_for_unknown_or_missing_defaults_to_free():
+    """Missing plans get the free cap; an unrecognized plan string does not.
+
+    Despite the test name, only ``None`` and ``""`` are asserted to fall back
+    to the free cap. The unrecognized string "mystery" is asserted to get the
+    Pro cap.
+    """
+    assert user_quotas.daily_cap_for(None) == user_quotas.CLAUDE_FREE_DAILY
+    assert user_quotas.daily_cap_for("") == user_quotas.CLAUDE_FREE_DAILY
+    # A non-empty plan string we don't recognize gets the Pro cap: per the
+    # function's contract, "free" is the only downgraded tier. If that ever
+    # flips, this test will flip too — adjust consciously.
+    assert user_quotas.daily_cap_for("mystery") == user_quotas.CLAUDE_PRO_DAILY
+@pytest.mark.asyncio
+async def test_increment_and_read_back_same_day():
+    assert await user_quotas.get_claude_used_today("u1") == 0
+    assert await user_quotas.increment_claude("u1") == 1
+    assert await user_quotas.increment_claude("u1") == 2
+    assert await user_quotas.get_claude_used_today("u1") == 2
+@pytest.mark.asyncio
+async def test_independent_users_do_not_share_counts():
+    await user_quotas.increment_claude("alice")
+    await user_quotas.increment_claude("alice")
+    await user_quotas.increment_claude("bob")
+    assert await user_quotas.get_claude_used_today("alice") == 2
+    assert await user_quotas.get_claude_used_today("bob") == 1
+@pytest.mark.asyncio
+async def test_stale_day_resets_before_next_read():
+    await user_quotas.increment_claude("u1")
+    # Simulate yesterday's entry still in the store.
+    user_quotas._claude_counts["u1"] = ("2000-01-01", 99)
+    assert await user_quotas.get_claude_used_today("u1") == 0
+    # And a fresh increment starts from 0.
+    assert await user_quotas.increment_claude("u1") == 1
+@pytest.mark.asyncio
+async def test_concurrent_increments_under_lock_do_not_lose_writes():
+    """50 coroutines bumping the same user must land at exactly 50."""
+    await asyncio.gather(*[user_quotas.increment_claude("race") for _ in range(50)])
+    assert await user_quotas.get_claude_used_today("race") == 50
+@pytest.mark.asyncio
+async def test_refund_decrements_and_drops_entry_at_zero():
+    await user_quotas.increment_claude("u1")
+    assert await user_quotas.get_claude_used_today("u1") == 1
+    await user_quotas.refund_claude("u1")
+    assert await user_quotas.get_claude_used_today("u1") == 0
+    assert "u1" not in user_quotas._claude_counts
+@pytest.mark.asyncio
+async def test_refund_on_nonexistent_user_is_noop():
+    await user_quotas.refund_claude("ghost")  # should not raise
+    assert await user_quotas.get_claude_used_today("ghost") == 0
+@pytest.mark.asyncio
+async def test_refund_on_stale_day_resets_rather_than_underflow():
+    user_quotas._claude_counts["u1"] = ("2000-01-01", 5)
+    await user_quotas.refund_claude("u1")
+    # Stale entry dropped; today's count stays 0.
+    assert await user_quotas.get_claude_used_today("u1") == 0
+@pytest.mark.asyncio
+async def test_free_user_cap_reached_at_one():
+    cap = user_quotas.daily_cap_for("free")
+    used = await user_quotas.increment_claude("freebie")
+    assert used == 1
+    assert used >= cap  # first bump exhausts the free tier (cap=1)
+@pytest.mark.asyncio
+async def test_pro_user_cap_reached_at_twenty():
+    cap = user_quotas.daily_cap_for("pro")
+    assert cap == 20
+    for i in range(1, 21):
+        assert await user_quotas.increment_claude("pro_user") == i
+    # 21st would exceed — the gate in routes/agent.py enforces this; here
+    # we just confirm the counter tracks past the cap so that check works.
+    assert await user_quotas.increment_claude("pro_user") == 21