akseljoonas committed on
Commit
2a1effd
·
2 Parent(s): 0718170540437a

Merge remote-tracking branch 'github/main' into space-main

Browse files
agent/config.py CHANGED
@@ -33,14 +33,15 @@ class Config(BaseModel):
33
  confirm_cpu_jobs: bool = True
34
  auto_file_upload: bool = False
35
 
36
- # Reasoning effort for models that support it (GPT-5 / o-series, Claude
37
- # extended thinking, HF reasoning models like MiniMax M2 / Kimi K2).
38
- # Defaults to "high" — we'd rather spend tokens thinking than ship a
39
- # wrong ML recipe. Users can dial down with `/effort low|medium|off`.
40
- # "minimal" is an OpenAI-only level and is normalized to "low" for HF
41
- # router models (MiniMax requires ≥low). Ignored for non-reasoning models.
42
- # Valid values: None | "minimal" | "low" | "medium" | "high"
43
- reasoning_effort: str | None = "high"
 
44
 
45
 
46
  def substitute_env_vars(obj: Any) -> Any:
 
33
  confirm_cpu_jobs: bool = True
34
  auto_file_upload: bool = False
35
 
36
+ # Reasoning effort *preference* — the ceiling the user wants. The probe
37
+ # on `/model` walks a cascade down from here (``max`` → ``xhigh`` → ``high``
38
+ # …) and caches per-model what the provider actually accepted in
39
+ # ``Session.model_effective_effort``. Default ``max`` because we'd rather
40
+ # burn tokens thinking than ship a wrong ML recipe; the cascade lands on
41
+ # whichever level the model supports (``high`` for GPT-5 / HF router,
42
+ # ``xhigh`` or ``max`` for Anthropic 4.6 / 4.7). ``None`` = thinking off.
43
+ # Valid values: None | "minimal" | "low" | "medium" | "high" | "xhigh" | "max"
44
+ reasoning_effort: str | None = "max"
45
 
46
 
47
  def substitute_env_vars(obj: Any) -> Any:
agent/context_manager/manager.py CHANGED
@@ -13,6 +13,8 @@ import yaml
13
  from jinja2 import Template
14
  from litellm import Message, acompletion
15
 
 
 
16
  logger = logging.getLogger(__name__)
17
 
18
  _HF_WHOAMI_URL = "https://huggingface.co/api/whoami-v2"
@@ -114,6 +116,9 @@ async def summarize_messages(
114
 
115
  prompt_messages = list(messages) + [Message(role="user", content=prompt)]
116
  llm_params = _resolve_llm_params(model_name, hf_token, reasoning_effort="high")
 
 
 
117
  response = await acompletion(
118
  messages=prompt_messages,
119
  max_completion_tokens=max_tokens,
 
13
  from jinja2 import Template
14
  from litellm import Message, acompletion
15
 
16
+ from agent.core.prompt_caching import with_prompt_caching
17
+
18
  logger = logging.getLogger(__name__)
19
 
20
  _HF_WHOAMI_URL = "https://huggingface.co/api/whoami-v2"
 
116
 
117
  prompt_messages = list(messages) + [Message(role="user", content=prompt)]
118
  llm_params = _resolve_llm_params(model_name, hf_token, reasoning_effort="high")
119
+ prompt_messages, tool_specs = with_prompt_caching(
120
+ prompt_messages, tool_specs, llm_params.get("model")
121
+ )
122
  response = await acompletion(
123
  messages=prompt_messages,
124
  max_completion_tokens=max_tokens,
agent/core/agent_loop.py CHANGED
@@ -14,6 +14,7 @@ from litellm.exceptions import ContextWindowExceededError
14
  from agent.config import Config
15
  from agent.core.doom_loop import check_for_doom_loop
16
  from agent.core.llm_params import _resolve_llm_params
 
17
  from agent.core.session import Event, OpType, Session
18
  from agent.core.tools import ToolRouter
19
  from agent.tools.jobs_tool import CPU_FLAVORS
@@ -136,6 +137,58 @@ def _is_transient_error(error: Exception) -> bool:
136
  return any(pattern in err_str for pattern in transient_patterns)
137
 
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  def _friendly_error_message(error: Exception) -> str | None:
140
  """Return a user-friendly message for known error types, or None to fall back to traceback."""
141
  err_str = str(error).lower()
@@ -243,6 +296,8 @@ class LLMResult:
243
  async def _call_llm_streaming(session: Session, messages, tools, llm_params) -> LLMResult:
244
  """Call the LLM with streaming, emitting assistant_chunk events."""
245
  response = None
 
 
246
  for _llm_attempt in range(_MAX_LLM_RETRIES):
247
  try:
248
  response = await acompletion(
@@ -258,6 +313,14 @@ async def _call_llm_streaming(session: Session, messages, tools, llm_params) ->
258
  except ContextWindowExceededError:
259
  raise
260
  except Exception as e:
 
 
 
 
 
 
 
 
261
  if _llm_attempt < _MAX_LLM_RETRIES - 1 and _is_transient_error(e):
262
  _delay = _LLM_RETRY_DELAYS[_llm_attempt]
263
  logger.warning(
@@ -328,6 +391,8 @@ async def _call_llm_streaming(session: Session, messages, tools, llm_params) ->
328
  async def _call_llm_non_streaming(session: Session, messages, tools, llm_params) -> LLMResult:
329
  """Call the LLM without streaming, emit assistant_message at the end."""
330
  response = None
 
 
331
  for _llm_attempt in range(_MAX_LLM_RETRIES):
332
  try:
333
  response = await acompletion(
@@ -342,6 +407,14 @@ async def _call_llm_non_streaming(session: Session, messages, tools, llm_params)
342
  except ContextWindowExceededError:
343
  raise
344
  except Exception as e:
 
 
 
 
 
 
 
 
345
  if _llm_attempt < _MAX_LLM_RETRIES - 1 and _is_transient_error(e):
346
  _delay = _LLM_RETRY_DELAYS[_llm_attempt]
347
  logger.warning(
@@ -490,10 +563,13 @@ class Handlers:
490
  tools = session.tool_router.get_tool_specs_for_llm()
491
  try:
492
  # ── Call the LLM (streaming or non-streaming) ──
 
 
 
493
  llm_params = _resolve_llm_params(
494
  session.config.model_name,
495
  session.hf_token,
496
- reasoning_effort=session.config.reasoning_effort,
497
  )
498
  if session.stream:
499
  llm_result = await _call_llm_streaming(session, messages, tools, llm_params)
 
14
  from agent.config import Config
15
  from agent.core.doom_loop import check_for_doom_loop
16
  from agent.core.llm_params import _resolve_llm_params
17
+ from agent.core.prompt_caching import with_prompt_caching
18
  from agent.core.session import Event, OpType, Session
19
  from agent.core.tools import ToolRouter
20
  from agent.tools.jobs_tool import CPU_FLAVORS
 
137
  return any(pattern in err_str for pattern in transient_patterns)
138
 
139
 
140
def _is_effort_config_error(error: Exception) -> bool:
    """Report whether ``error`` is a reasoning-effort configuration rejection.

    Two failure shapes count, the same two the effort probe classifies:
    the model rejects thinking outright, or it rejects the specific effort
    level that was sent.

    This is the safety net for the case where ``/effort`` was changed
    mid-conversation (which clears the probe cache) and the new level
    doesn't work for the current model. The caller heals the cache and
    retries once.
    """
    # Imported at call time rather than at module import time.
    from agent.core.effort_probe import _is_invalid_effort, _is_thinking_unsupported

    if _is_thinking_unsupported(error):
        return True
    return _is_invalid_effort(error)
150
+
151
+
152
async def _heal_effort_and_rebuild_params(
    session: Session, error: Exception, llm_params: dict,
) -> dict:
    """Repair the session's per-model effort cache and rebuild the LLM kwargs.

    Only meant to be called when ``_is_effort_config_error(error)`` is True.
    ``llm_params`` (the kwargs that just failed) is accepted for signature
    symmetry with the call sites but is not read here; the returned dict is
    built from scratch.

    What gets cached for the session's current model:
      • thinking unsupported → ``None``, so the next call sends no thinking
        params at all
      • invalid effort level → whatever the full cascade probe settles on
      • probe inconclusive (transient failure while healing) → ``None``;
        the next real call either succeeds or surfaces the real error

    Exceptions from the probe other than ``ProbeInconclusive`` propagate.
    """
    from agent.core.effort_probe import ProbeInconclusive, _is_thinking_unsupported, probe_effort

    model_id = session.config.model_name

    if _is_thinking_unsupported(error):
        session.model_effective_effort[model_id] = None
        logger.info("healed: %s doesn't support thinking — stripped", model_id)
    else:
        try:
            probed = await probe_effort(
                model_id, session.config.reasoning_effort, session.hf_token,
            )
        except ProbeInconclusive:
            # Couldn't get a verdict — fall back to no thinking for safety.
            session.model_effective_effort[model_id] = None
            logger.info("healed: %s probe inconclusive — stripped", model_id)
        else:
            session.model_effective_effort[model_id] = probed.effective_effort
            logger.info(
                "healed: %s effort cascade → %s", model_id, probed.effective_effort,
            )

    # Re-read through the session accessor so the rebuilt params reflect
    # whatever was just written to the cache.
    healed_effort = session.effective_effort_for(model_id)
    return _resolve_llm_params(
        model_id,
        session.hf_token,
        reasoning_effort=healed_effort,
    )
190
+
191
+
192
  def _friendly_error_message(error: Exception) -> str | None:
193
  """Return a user-friendly message for known error types, or None to fall back to traceback."""
194
  err_str = str(error).lower()
 
296
  async def _call_llm_streaming(session: Session, messages, tools, llm_params) -> LLMResult:
297
  """Call the LLM with streaming, emitting assistant_chunk events."""
298
  response = None
299
+ _healed_effort = False # one-shot safety net per call
300
+ messages, tools = with_prompt_caching(messages, tools, llm_params.get("model"))
301
  for _llm_attempt in range(_MAX_LLM_RETRIES):
302
  try:
303
  response = await acompletion(
 
313
  except ContextWindowExceededError:
314
  raise
315
  except Exception as e:
316
+ if not _healed_effort and _is_effort_config_error(e):
317
+ _healed_effort = True
318
+ llm_params = await _heal_effort_and_rebuild_params(session, e, llm_params)
319
+ await session.send_event(Event(
320
+ event_type="tool_log",
321
+ data={"tool": "system", "log": "Reasoning effort not supported for this model — adjusting and retrying."},
322
+ ))
323
+ continue
324
  if _llm_attempt < _MAX_LLM_RETRIES - 1 and _is_transient_error(e):
325
  _delay = _LLM_RETRY_DELAYS[_llm_attempt]
326
  logger.warning(
 
391
  async def _call_llm_non_streaming(session: Session, messages, tools, llm_params) -> LLMResult:
392
  """Call the LLM without streaming, emit assistant_message at the end."""
393
  response = None
394
+ _healed_effort = False
395
+ messages, tools = with_prompt_caching(messages, tools, llm_params.get("model"))
396
  for _llm_attempt in range(_MAX_LLM_RETRIES):
397
  try:
398
  response = await acompletion(
 
407
  except ContextWindowExceededError:
408
  raise
409
  except Exception as e:
410
+ if not _healed_effort and _is_effort_config_error(e):
411
+ _healed_effort = True
412
+ llm_params = await _heal_effort_and_rebuild_params(session, e, llm_params)
413
+ await session.send_event(Event(
414
+ event_type="tool_log",
415
+ data={"tool": "system", "log": "Reasoning effort not supported for this model — adjusting and retrying."},
416
+ ))
417
+ continue
418
  if _llm_attempt < _MAX_LLM_RETRIES - 1 and _is_transient_error(e):
419
  _delay = _LLM_RETRY_DELAYS[_llm_attempt]
420
  logger.warning(
 
563
  tools = session.tool_router.get_tool_specs_for_llm()
564
  try:
565
  # ── Call the LLM (streaming or non-streaming) ──
566
+ # Pull the per-model probed effort from the session cache when
567
+ # available; fall back to the raw preference for models we
568
+ # haven't probed yet (e.g. research sub-model).
569
  llm_params = _resolve_llm_params(
570
  session.config.model_name,
571
  session.hf_token,
572
+ reasoning_effort=session.effective_effort_for(session.config.model_name),
573
  )
574
  if session.stream:
575
  llm_result = await _call_llm_streaming(session, messages, tools, llm_params)
agent/core/effort_probe.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Probe-and-cascade for reasoning effort on /model switch.
2
+
3
+ We don't maintain a per-model capability table. Instead, the first time a
4
+ user picks a model we fire a 1-token ping with the same params we'd use
5
+ for real and walk down a cascade (``max`` → ``xhigh`` → ``high`` → …)
6
+ until the provider stops rejecting us. The result is cached per-model on
7
+ the session, so real messages don't pay the probe cost again.
8
+
9
+ Three outcomes, classified from the 400 error text:
10
+
11
+ * success → cache the effort that worked
12
+ * ``"thinking ... not supported"`` → model doesn't do thinking at all;
13
+ cache ``None`` so we stop sending thinking params
14
+ * ``"effort ... invalid"`` / synonyms → cascade walks down and retries
15
+
16
+ Transient errors (5xx, timeout, connection reset) bubble out as
17
+ ``ProbeInconclusive`` so the caller can complete the switch with a
18
+ warning instead of blocking on a flaky provider.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import asyncio
24
+ import logging
25
+ from dataclasses import dataclass
26
+
27
+ from litellm import acompletion
28
+
29
+ from agent.core.llm_params import UnsupportedEffortError, _resolve_llm_params
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
# Fallback ladder per user preference: the probe tries the listed levels in
# order and stops at the first one that works. ``max`` / ``xhigh`` are
# Anthropic-only; for other providers ``_resolve_llm_params(strict=True)``
# raises ``UnsupportedEffortError`` synchronously (no wasted network
# round-trip) and the probe simply moves to the next rung.
_EFFORT_CASCADE: dict[str, list[str]] = dict(
    max=["max", "xhigh", "high", "medium", "low"],
    xhigh=["xhigh", "high", "medium", "low"],
    high=["high", "medium", "low"],
    medium=["medium", "low"],
    minimal=["minimal", "low"],
    low=["low"],
)

# Per-attempt wall-clock budget (seconds) and completion cap for the ping.
_PROBE_TIMEOUT = 15.0
_PROBE_MAX_TOKENS = 16
49
+
50
+
51
class ProbeInconclusive(Exception):
    """Raised when the probe hit a transient network / provider error.

    No verdict about effort support could be reached. The caller is
    expected to complete the model switch with a warning; if the problem
    is persistent, the next real call will surface it again.
    """
57
+
58
+
59
@dataclass
class ProbeOutcome:
    """Result of one probe run.

    ``effective_effort`` uses the same convention as the per-model cache:
    a string means "send this level"; ``None`` means the model doesn't
    support thinking and the thinking params should be stripped.
    """

    # Level to send on real calls, or None to send no thinking params.
    effective_effort: str | None
    # Number of network pings actually issued.
    attempts: int
    # Wall-clock duration of the probe, in milliseconds.
    elapsed_ms: int
    # Optional human-readable remark, e.g. "max not supported, falling back".
    note: str | None = None
70
+
71
+
72
def _is_thinking_unsupported(e: Exception) -> bool:
    """Return True when the error says the model rejects thinking altogether.

    Targets messages such as Anthropic's 'thinking.type.enabled is not
    supported for this model' and the adaptive variant. The check is a
    case-insensitive substring match on two markers rather than an exact
    string comparison, because the wording shifts across API versions.
    """
    text = str(e).lower()
    return all(marker in text for marker in ("thinking", "not supported"))
81
+
82
+
83
def _is_invalid_effort(e: Exception) -> bool:
    """Return True when the error rejects the requested effort *level*.

    Handles both provider responses (Anthropic/OpenAI 400s phrased with
    "invalid", "must be one of", etc.) and LiteLLM's local validation that
    fires before any request is sent (e.g. "effort='max' is only supported
    by Claude Opus 4.6"). Either way the cascade should step down a level.

    Errors that are really about thinking itself are excluded up front:
    Anthropic's 4.7 error mentions ``output_config.effort`` in its fix
    hint even though the actual failure is ``thinking.type.enabled`` being
    unsupported. ``_is_thinking_unsupported`` owns that case.
    """
    if _is_thinking_unsupported(e):
        return False

    text = str(e).lower()
    mentions_effort = "effort" in text or "output_config" in text
    if not mentions_effort:
        return False

    rejection_phrases = (
        "invalid", "not supported", "must be one of", "not a valid",
        "unrecognized", "unknown",
        # Phrasing used by LiteLLM's own pre-flight validation.
        "only supported by", "is only supported",
    )
    for phrase in rejection_phrases:
        if phrase in text:
            return True
    return False
111
+
112
+
113
def _is_transient(e: Exception) -> bool:
    """Return True for network / provider-side flakes worth retrying later.

    Keep the marker list in sync with agent_loop's list.

    Timeouts are recognised by exception type first: ``str(e)`` is empty
    for ``asyncio.TimeoutError``, so substring matching alone misses it.
    Everything else is a case-insensitive substring match on the message.
    """
    if isinstance(e, asyncio.TimeoutError) or isinstance(e, TimeoutError):
        return True

    markers = (
        "timeout", "timed out", "429", "rate limit",
        "503", "service unavailable", "502", "bad gateway",
        "500", "internal server error", "overloaded", "capacity",
        "connection reset", "connection refused", "connection error",
        "eof", "broken pipe",
    )
    text = str(e).lower()
    for marker in markers:
        if marker in text:
            return True
    return False
132
+
133
+
134
async def probe_effort(
    model_name: str,
    preference: str | None,
    hf_token: str | None,
) -> ProbeOutcome:
    """Walk the effort cascade for ``preference`` on ``model_name``.

    Each level in the cascade is first run through
    ``_resolve_llm_params(..., strict=True)``; levels the provider's API
    surface can't express are skipped without a network call. The remaining
    levels are tried in order with a small non-streaming "ping" completion
    until one is accepted.

    Args:
        model_name: Model id, passed through to ``_resolve_llm_params``.
        preference: The user's requested effort level. A falsy value means
            "thinking off" and short-circuits without any network call.
            A value with no cascade entry is tried on its own.
        hf_token: Token forwarded to ``_resolve_llm_params``.

    Returns:
        A ``ProbeOutcome`` whose ``effective_effort`` is the first level the
        provider accepted, or ``None`` when the model rejects thinking
        altogether or no level in the cascade was accepted.

    Raises:
        ProbeInconclusive: a transient error (5xx, timeout, ...) prevented
            a verdict.
        Exception: any other error that is not thinking/effort related
            (auth, model-not-found, quota, ...) is re-raised unchanged so
            the caller can classify and surface it.
    """
    if not preference:
        # User explicitly turned effort off — nothing to probe. A bare
        # ping with no thinking params is pointless; just report "off".
        return ProbeOutcome(effective_effort=None, attempts=0, elapsed_ms=0)

    # We are inside a coroutine, so a loop is running by definition;
    # get_running_loop() is the supported way to reach its monotonic clock.
    loop = asyncio.get_running_loop()
    started = loop.time()

    def _elapsed_ms() -> int:
        return int((loop.time() - started) * 1000)

    attempts = 0
    skipped: list[str] = []  # levels rejected synchronously, before any request

    for effort in _EFFORT_CASCADE.get(preference, [preference]):
        try:
            params = _resolve_llm_params(
                model_name, hf_token, reasoning_effort=effort, strict=True,
            )
        except UnsupportedEffortError:
            # Provider can't even accept this effort name (e.g. "max" on
            # HF router). Skip without a network call.
            skipped.append(effort)
            continue

        attempts += 1
        try:
            await asyncio.wait_for(
                acompletion(
                    messages=[{"role": "user", "content": "ping"}],
                    max_tokens=_PROBE_MAX_TOKENS,
                    stream=False,
                    **params,
                ),
                timeout=_PROBE_TIMEOUT,
            )
        except Exception as e:
            if _is_thinking_unsupported(e):
                return ProbeOutcome(
                    effective_effort=None,
                    attempts=attempts,
                    elapsed_ms=_elapsed_ms(),
                    note="model doesn't support reasoning, dropped",
                )
            if _is_invalid_effort(e):
                logger.debug("probe: %s rejected effort=%s, trying next", model_name, effort)
                continue
            if _is_transient(e):
                raise ProbeInconclusive(str(e)) from e
            # Persistent non-thinking 4xx (auth, quota, model-not-found) —
            # let the caller classify & surface.
            raise

        note = None if effort == preference else f"{preference} not supported, using {effort}"
        return ProbeOutcome(
            effective_effort=effort,
            attempts=attempts,
            elapsed_ms=_elapsed_ms(),
            note=note,
        )

    # Cascade exhausted without a success. The only ways to get here are
    # a synchronous skip or an "invalid effort" rejection on every level:
    # every other error either returned or raised inside the loop, so there
    # is no pending non-effort error left to re-raise at this point.
    note = (
        "no effort level accepted — proceeding without thinking"
        if not skipped
        else f"provider rejected all efforts ({', '.join(skipped)})"
    )
    return ProbeOutcome(
        effective_effort=None,
        attempts=attempts,
        elapsed_ms=_elapsed_ms(),
        note=note,
    )
agent/core/llm_params.py CHANGED
@@ -8,41 +8,122 @@ creating circular imports.
8
  import os
9
 
10
 
11
- # HF router reasoning models only accept "low" | "medium" | "high" (e.g.
12
- # MiniMax M2 actually *requires* reasoning to be enabled). OpenAI's GPT-5
13
- # also accepts "minimal" for near-zero thinking. We map "minimal" to "low"
14
- # for HF so the user doesn't get a 400.
15
- _HF_ALLOWED_EFFORTS = {"low", "medium", "high"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
 
18
  def _resolve_llm_params(
19
  model_name: str,
20
  session_hf_token: str | None = None,
21
  reasoning_effort: str | None = None,
 
22
  ) -> dict:
23
  """
24
  Build LiteLLM kwargs for a given model id.
25
 
26
- • ``anthropic/<model>`` / ``openai/<model>`` — passed straight through; the
27
- user's own ``ANTHROPIC_API_KEY`` / ``OPENAI_API_KEY`` env vars are picked
28
- up by LiteLLM. ``reasoning_effort`` is forwarded as a top-level param
29
- (GPT-5 / o-series accept "minimal" | "low" | "medium" | "high"; Claude
30
- extended-thinking models accept "low" | "medium" | "high" and LiteLLM
31
- translates to the thinking config).
 
 
 
 
 
 
 
 
 
32
 
33
  • Anything else is treated as a HuggingFace router id. We hit the
34
  auto-routing OpenAI-compatible endpoint at
35
- ``https://router.huggingface.co/v1``, which bypasses LiteLLM's stale
36
- per-provider HF adapter entirely. The id can be bare or carry an HF
37
- routing suffix:
38
-
39
- MiniMaxAI/MiniMax-M2.7 # auto = fastest + failover
40
- MiniMaxAI/MiniMax-M2.7:cheapest
41
- moonshotai/Kimi-K2.6:novita # pin a specific provider
42
 
43
- A leading ``huggingface/`` is stripped for convenience. ``reasoning_effort``
44
- is forwarded via ``extra_body`` (LiteLLM's OpenAI adapter refuses it as a
45
- top-level kwarg for non-OpenAI models). "minimal" is normalized to "low".
 
 
 
46
 
47
  Token precedence (first non-empty wins):
48
  1. INFERENCE_TOKEN env — shared key on the hosted Space (inference is
@@ -50,10 +131,39 @@ def _resolve_llm_params(
50
  2. session.hf_token — the user's own token (CLI / OAuth / cache file).
51
  3. HF_TOKEN env — belt-and-suspenders fallback for CLI users.
52
  """
53
- if model_name.startswith(("anthropic/", "openai/")):
54
  params: dict = {"model": model_name}
55
  if reasoning_effort:
56
- params["reasoning_effort"] = reasoning_effort
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  return params
58
 
59
  hf_model = model_name.removeprefix("huggingface/")
@@ -72,6 +182,11 @@ def _resolve_llm_params(
72
  params["extra_headers"] = {"X-HF-Bill-To": bill_to}
73
  if reasoning_effort:
74
  hf_level = "low" if reasoning_effort == "minimal" else reasoning_effort
75
- if hf_level in _HF_ALLOWED_EFFORTS:
 
 
 
 
 
76
  params["extra_body"] = {"reasoning_effort": hf_level}
77
  return params
 
8
  import os
9
 
10
 
11
def _patch_litellm_effort_validation() -> None:
    """Widen LiteLLM 1.83's Opus-4.6-only gate on ``effort="max"``.

    Context: at ``litellm/llms/anthropic/chat/transformation.py:~1443`` the
    Anthropic adapter validates ``output_config.effort ∈ {high, medium,
    low, max}`` and gates ``max`` behind an ``_is_opus_4_6_model`` check
    that only matches the substring ``opus-4-6`` / ``opus_4_6``. Result:
    ``max`` on Opus 4.7 is rejected with "effort='max' is only supported
    by Claude Opus 4.6", even though Opus 4.7 accepts it in practice.

    What this patch does: replaces ``AnthropicConfig._is_opus_4_6_model``
    with a matcher that also accepts the Opus 4.7 spellings. Nothing else
    in LiteLLM is touched.

    NOTE(review): the valid-effort-set check itself is NOT removed here, so
    ``xhigh`` is presumably still rejected pre-flight with "Invalid effort
    value: xhigh" and handled by the probe cascade stepping down — confirm
    against the installed LiteLLM version.

    The patch is best-effort and idempotent: it silently does nothing when
    LiteLLM (or the expected class / method) is missing, and it marks the
    replacement so a second call is a no-op.

    Removable once litellm ships 1.83.8-stable (which merges PR #25867,
    "Litellm day 0 opus 4.7 support") — see commit 0868a82 on their main
    branch. Until then, this one-time patch is the escape hatch.
    """
    try:
        from litellm.llms.anthropic.chat import transformation as _t
    except Exception:
        # Deliberately broad: whatever goes wrong importing LiteLLM
        # internals, skipping the patch is preferable to failing at import.
        return

    cfg = getattr(_t, "AnthropicConfig", None)
    if cfg is None:
        return

    original = getattr(cfg, "_is_opus_4_6_model", None)
    if original is None or getattr(original, "_hf_agent_patched", False):
        # Either the hook no longer exists or we already replaced it.
        return

    def _widened(model: str) -> bool:
        m = model.lower()
        # Original 4.6 match plus Opus 4.7, in every separator spelling.
        # New families must be added here explicitly; when the API still
        # rejects a level, the probe cascade handles it.
        return any(
            v in m for v in (
                "opus-4-6", "opus_4_6", "opus-4.6", "opus_4.6",
                "opus-4-7", "opus_4_7", "opus-4.7", "opus_4.7",
            )
        )

    _widened._hf_agent_patched = True  # type: ignore[attr-defined]
    cfg._is_opus_4_6_model = staticmethod(_widened)
62
+
63
+
64
+ _patch_litellm_effort_validation()
65
+
66
+
67
# Effort names each provider's wire format can carry:
#   Anthropic (4.6+)  output_config.effort          low | medium | high | xhigh | max
#   OpenAI direct     reasoning_effort (top-level)  minimal | low | medium | high
#   HF router         extra_body.reasoning_effort   low | medium | high
#
# These sets only validate the *shape* of the request. Whether a given
# model accepts a given level is left to the probe cascade, which walks
# down on rejection; there is deliberately no per-model capability table.
_ANTHROPIC_EFFORTS = {"max", "xhigh", "high", "medium", "low"}
_OPENAI_EFFORTS = {"high", "medium", "low", "minimal"}
_HF_EFFORTS = {"high", "medium", "low"}
77
+
78
+
79
class UnsupportedEffortError(ValueError):
    """The requested effort name is outside the provider's accepted set.

    Raised synchronously, before any network call, so the probe cascade
    can skip levels a provider's API surface can't express (e.g. ``max``
    on HF router).
    """
85
 
86
 
87
  def _resolve_llm_params(
88
  model_name: str,
89
  session_hf_token: str | None = None,
90
  reasoning_effort: str | None = None,
91
+ strict: bool = False,
92
  ) -> dict:
93
  """
94
  Build LiteLLM kwargs for a given model id.
95
 
96
+ • ``anthropic/<model>`` — native thinking config. We bypass LiteLLM's
97
+ ``reasoning_effort`` → ``thinking`` mapping (which lags new Claude
98
+ releases like 4.7 and sends the wrong API shape). Instead we pass
99
+ both ``thinking={"type": "adaptive"}`` and ``output_config=
100
+ {"effort": <level>}`` as top-level kwargs — LiteLLM's Anthropic
101
+ adapter forwards unknown top-level kwargs into the request body
102
+ verbatim (confirmed by live probe; ``extra_body`` does NOT work
103
+ here because Anthropic's API rejects it as "Extra inputs are not
104
+ permitted"). This is the stable API for 4.6 and 4.7. Older
105
+ extended-thinking models that only accept ``thinking.type.enabled``
106
+ will reject this; the probe's cascade catches that and falls back
107
+ to no thinking.
108
+
109
+ • ``openai/<model>`` — ``reasoning_effort`` forwarded as a top-level
110
+ kwarg (GPT-5 / o-series). LiteLLM uses the user's ``OPENAI_API_KEY``.
111
 
112
  • Anything else is treated as a HuggingFace router id. We hit the
113
  auto-routing OpenAI-compatible endpoint at
114
+ ``https://router.huggingface.co/v1``. The id can be bare or carry an
115
+ HF routing suffix (``:fastest`` / ``:cheapest`` / ``:<provider>``).
116
+ A leading ``huggingface/`` is stripped. ``reasoning_effort`` is
117
+ forwarded via ``extra_body`` (LiteLLM's OpenAI adapter refuses it as
118
+ a top-level kwarg for non-OpenAI models). "minimal" normalizes to
119
+ "low".
 
120
 
121
+ ``strict=True`` raises ``UnsupportedEffortError`` when the requested
122
+ effort isn't in the provider's accepted set, instead of silently
123
+ dropping it. The probe cascade uses strict mode so it can walk down
124
+ (``max`` → ``xhigh`` → ``high`` …) without making an API call. Regular
125
+ runtime callers leave ``strict=False``, so a stale cached effort
126
+ can't crash a turn — it just doesn't get sent.
127
 
128
  Token precedence (first non-empty wins):
129
  1. INFERENCE_TOKEN env — shared key on the hosted Space (inference is
 
131
  2. session.hf_token — the user's own token (CLI / OAuth / cache file).
132
  3. HF_TOKEN env — belt-and-suspenders fallback for CLI users.
133
  """
134
+ if model_name.startswith("anthropic/"):
135
  params: dict = {"model": model_name}
136
  if reasoning_effort:
137
+ level = reasoning_effort
138
+ if level == "minimal":
139
+ level = "low"
140
+ if level not in _ANTHROPIC_EFFORTS:
141
+ if strict:
142
+ raise UnsupportedEffortError(
143
+ f"Anthropic doesn't accept effort={level!r}"
144
+ )
145
+ else:
146
+ # Adaptive thinking + output_config.effort is the stable
147
+ # Anthropic API for Claude 4.6 / 4.7. Both kwargs are
148
+ # passed top-level: LiteLLM forwards unknown params into
149
+ # the request body for Anthropic, so ``output_config``
150
+ # reaches the API. ``extra_body`` does NOT work here —
151
+ # Anthropic rejects it as "Extra inputs are not
152
+ # permitted".
153
+ params["thinking"] = {"type": "adaptive"}
154
+ params["output_config"] = {"effort": level}
155
+ return params
156
+
157
+ if model_name.startswith("openai/"):
158
+ params = {"model": model_name}
159
+ if reasoning_effort:
160
+ if reasoning_effort not in _OPENAI_EFFORTS:
161
+ if strict:
162
+ raise UnsupportedEffortError(
163
+ f"OpenAI doesn't accept effort={reasoning_effort!r}"
164
+ )
165
+ else:
166
+ params["reasoning_effort"] = reasoning_effort
167
  return params
168
 
169
  hf_model = model_name.removeprefix("huggingface/")
 
182
  params["extra_headers"] = {"X-HF-Bill-To": bill_to}
183
  if reasoning_effort:
184
  hf_level = "low" if reasoning_effort == "minimal" else reasoning_effort
185
+ if hf_level not in _HF_EFFORTS:
186
+ if strict:
187
+ raise UnsupportedEffortError(
188
+ f"HF router doesn't accept effort={hf_level!r}"
189
+ )
190
+ else:
191
  params["extra_body"] = {"reasoning_effort": hf_level}
192
  return params
agent/core/model_switcher.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Model-switching logic for the interactive CLI's ``/model`` command.
2
+
3
+ Split out of ``agent.main`` so the REPL dispatcher stays focused on input
4
+ parsing. Exposes:
5
+
6
+ * ``SUGGESTED_MODELS`` — the short list shown by ``/model`` with no arg.
7
+ * ``is_valid_model_id`` — loose format check on user input.
8
+ * ``probe_and_switch_model`` — async: checks routing, fires a 1-token
9
+ probe to resolve the effort cascade, then commits the switch (or
10
+ rejects it on hard error).
11
+
12
+ The probe's cascade lives in ``agent.core.effort_probe``; this module
13
+ glues it to CLI output + session state.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from agent.core.effort_probe import ProbeInconclusive, probe_effort
19
+
20
+
21
+ # Suggested models shown by `/model` (not a gate). Users can paste any HF
22
+ # model id (e.g. "MiniMaxAI/MiniMax-M2.7") or an `anthropic/` / `openai/`
23
+ # prefix for direct API access. For HF ids, append ":fastest" /
24
+ # ":cheapest" / ":preferred" / ":<provider>" to override the default
25
+ # routing policy (auto = fastest with failover).
26
+ SUGGESTED_MODELS = [
27
+ {"id": "anthropic/claude-opus-4-7", "label": "Claude Opus 4.7"},
28
+ {"id": "anthropic/claude-opus-4-6", "label": "Claude Opus 4.6"},
29
+ {"id": "MiniMaxAI/MiniMax-M2.7", "label": "MiniMax M2.7"},
30
+ {"id": "moonshotai/Kimi-K2.6", "label": "Kimi K2.6"},
31
+ {"id": "zai-org/GLM-5.1", "label": "GLM 5.1"},
32
+ ]
33
+
34
+
35
+ _ROUTING_POLICIES = {"fastest", "cheapest", "preferred"}
36
+
37
+
38
+ def is_valid_model_id(model_id: str) -> bool:
39
+ """Loose format check — lets users pick any model id.
40
+
41
+ Accepts:
42
+ • anthropic/<model>
43
+ • openai/<model>
44
+ • <org>/<model>[:<tag>] (HF router; tag = provider or policy)
45
+ • huggingface/<org>/<model>[:<tag>] (same, accepts legacy prefix)
46
+
47
+ Actual availability is verified against the HF router catalog on
48
+ switch, and by the provider on the probe's ping call.
49
+ """
50
+ if not model_id or "/" not in model_id:
51
+ return False
52
+ head = model_id.split(":", 1)[0]
53
+ parts = head.split("/")
54
+ return len(parts) >= 2 and all(parts)
55
+
56
+
57
+ def _print_hf_routing_info(model_id: str, console) -> bool:
58
+ """Show HF router catalog info (providers, price, context, tool support)
59
+ for an HF-router model id. Returns ``True`` to signal the caller can
60
+ proceed with the switch, ``False`` to indicate a hard problem the user
61
+ should notice before we fire the effort probe. (Every current code path returns ``True``; the warnings printed here are advisory only.)
62
+
63
+ Anthropic / OpenAI ids return ``True`` without printing anything —
64
+ the probe below covers "does this model exist".
65
+ """
66
+ if model_id.startswith(("anthropic/", "openai/")):
67
+ return True
68
+
69
+ from agent.core import hf_router_catalog as cat
70
+
71
+ bare, _, tag = model_id.partition(":")
72
+ info = cat.lookup(bare)
73
+ if info is None:
74
+ console.print(
75
+ f"[bold red]Warning:[/bold red] '{bare}' isn't in the HF router "
76
+ "catalog. Checking anyway — first call may fail."
77
+ )
78
+ suggestions = cat.fuzzy_suggest(bare)
79
+ if suggestions:
80
+ console.print(f"[dim]Did you mean: {', '.join(suggestions)}[/dim]")
81
+ return True
82
+
83
+ live = info.live_providers
84
+ if not live:
85
+ console.print(
86
+ f"[bold red]Warning:[/bold red] '{bare}' has no live providers "
87
+ "right now. First call will likely fail."
88
+ )
89
+ return True
90
+
91
+ if tag and tag not in _ROUTING_POLICIES:
92
+ matched = [p for p in live if p.provider == tag]
93
+ if not matched:
94
+ names = ", ".join(p.provider for p in live)
95
+ console.print(
96
+ f"[bold red]Warning:[/bold red] provider '{tag}' doesn't serve "
97
+ f"'{bare}'. Live providers: {names}. Checking anyway."
98
+ )
99
+
100
+ if not info.any_supports_tools:
101
+ console.print(
102
+ f"[bold red]Warning:[/bold red] no provider for '{bare}' advertises "
103
+ "tool-call support. This agent relies on tool calls — expect errors."
104
+ )
105
+
106
+ if tag in _ROUTING_POLICIES:
107
+ policy = tag
108
+ elif tag:
109
+ policy = f"pinned to {tag}"
110
+ else:
111
+ policy = "auto (fastest)"
112
+ console.print(f" [dim]routing: {policy}[/dim]")
113
+ for p in live:
114
+ price = (
115
+ f"${p.input_price:g}/${p.output_price:g} per M tok"
116
+ if p.input_price is not None and p.output_price is not None
117
+ else "price n/a"
118
+ )
119
+ ctx = f"{p.context_length:,} ctx" if p.context_length else "ctx n/a"
120
+ tools = "tools" if p.supports_tools else "no tools"
121
+ console.print(
122
+ f" [dim]{p.provider}: {price}, {ctx}, {tools}[/dim]"
123
+ )
124
+ return True
125
+
126
+
127
+ def print_model_listing(config, console) -> None:
128
+ """Render the default ``/model`` (no-arg) view: current + suggested."""
129
+ current = config.model_name if config else ""
130
+ console.print("[bold]Current model:[/bold]")
131
+ console.print(f" {current}")
132
+ console.print("\n[bold]Suggested:[/bold]")
133
+ for m in SUGGESTED_MODELS:
134
+ marker = " [dim]<-- current[/dim]" if m["id"] == current else ""
135
+ console.print(f" {m['id']} [dim]({m['label']})[/dim]{marker}")
136
+ console.print(
137
+ "\n[dim]Paste any HF model id (e.g. 'MiniMaxAI/MiniMax-M2.7').\n"
138
+ "Add ':fastest', ':cheapest', ':preferred', or ':<provider>' to override routing.\n"
139
+ "Use 'anthropic/<model>' or 'openai/<model>' for direct API access.[/dim]"
140
+ )
141
+
142
+
143
+ def print_invalid_id(arg: str, console) -> None:
144
+ console.print(f"[bold red]Invalid model id format:[/bold red] {arg}")
145
+ console.print(
146
+ "[dim]Expected:\n"
147
+ " • <org>/<model>[:tag] (HF router — paste from huggingface.co)\n"
148
+ " • anthropic/<model>\n"
149
+ " • openai/<model>[/dim]"
150
+ )
151
+
152
+
153
+ async def probe_and_switch_model(
154
+ model_id: str,
155
+ config,
156
+ session,
157
+ console,
158
+ hf_token: str | None,
159
+ ) -> None:
160
+ """Validate model+effort with a 1-token ping, cache the effective effort,
161
+ then commit the switch.
162
+
163
+ Three visible outcomes:
164
+
165
+ * ✓ ``effort: <level>`` — model accepted the preferred effort (or a
166
+ fallback from the cascade; the note explains if so)
167
+ * ✓ ``effort: off`` — model doesn't support thinking; we'll strip it
168
+ * ✗ hard error (auth, model-not-found, quota) — we reject the switch
169
+ and keep the current model so the user isn't stranded
170
+
171
+ Transient errors (5xx, timeout) complete the switch with a yellow
172
+ warning; the next real call re-surfaces the error if it's persistent.
173
+ """
174
+ preference = config.reasoning_effort
175
+ if not _print_hf_routing_info(model_id, console):
176
+ return
177
+
178
+ if not preference:
179
+ # Nothing to validate with a ping that we couldn't validate on the
180
+ # first real call just as cheaply. Skip the probe entirely.
181
+ _commit_switch(model_id, config, session, effective=None, cache=False)
182
+ console.print(f"[green]Model switched to {model_id}[/green] [dim](effort: off)[/dim]")
183
+ return
184
+
185
+ console.print(f"[dim]checking {model_id} (effort: {preference})...[/dim]")
186
+ try:
187
+ outcome = await probe_effort(model_id, preference, hf_token)
188
+ except ProbeInconclusive as e:
189
+ _commit_switch(model_id, config, session, effective=None, cache=False)
190
+ console.print(
191
+ f"[yellow]Model switched to {model_id}[/yellow] "
192
+ f"[dim](couldn't validate: {e}; will verify on first message)[/dim]"
193
+ )
194
+ return
195
+ except Exception as e:
196
+ # Hard persistent error — auth, unknown model, quota. Don't switch.
197
+ console.print(f"[bold red]Switch failed:[/bold red] {e}")
198
+ console.print(f"[dim]Keeping current model: {config.model_name}[/dim]")
199
+ return
200
+
201
+ _commit_switch(
202
+ model_id, config, session,
203
+ effective=outcome.effective_effort, cache=True,
204
+ )
205
+ effort_label = outcome.effective_effort or "off"
206
+ suffix = f" — {outcome.note}" if outcome.note else ""
207
+ console.print(
208
+ f"[green]Model switched to {model_id}[/green] "
209
+ f"[dim](effort: {effort_label}{suffix}, {outcome.elapsed_ms}ms)[/dim]"
210
+ )
211
+
212
+
213
+ def _commit_switch(model_id, config, session, effective, cache: bool) -> None:
214
+ """Apply the switch to the session (or bare config if no session yet).
215
+
216
+ ``effective`` is the probe's resolved effort; ``cache=True`` stores it
217
+ in the session's per-model cache so real calls use the resolved level
218
+ instead of re-probing. ``cache=False`` (inconclusive probe / effort
219
+ off) drops any cached entry for this model — next call falls back to preference.
220
+ """
221
+ if session is not None:
222
+ session.update_model(model_id)
223
+ if cache:
224
+ session.model_effective_effort[model_id] = effective
225
+ else:
226
+ session.model_effective_effort.pop(model_id, None)
227
+ else:
228
+ config.model_name = model_id
agent/core/prompt_caching.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Anthropic prompt caching breakpoints for outgoing LLM requests.
2
+
3
+ Caching is GA on Anthropic's API and natively supported by litellm >=1.83
4
+ via ``cache_control`` blocks. We apply two breakpoints (out of 4 allowed):
5
+
6
+ 1. The tool block — caches all tool definitions as a single prefix.
7
+ 2. The system message — caches the rendered system prompt.
8
+
9
+ Together these cover the ~4-5K static tokens that were being re-billed on
10
+ every turn. Subsequent turns within the 5-minute TTL hit cache_read pricing
11
+ (~10% of input cost) instead of full input.
12
+
13
+ Non-Anthropic models (HF router, OpenAI) are passed through unchanged.
14
+ """
15
+
16
+ from typing import Any
17
+
18
+
19
+ def with_prompt_caching(
20
+ messages: list[Any],
21
+ tools: list[dict] | None,
22
+ model_name: str | None,
23
+ ) -> tuple[list[Any], list[dict] | None]:
24
+ """Return (messages, tools) with cache_control breakpoints for Anthropic.
25
+
26
+ No-op for non-Anthropic models. Original objects are not mutated; a fresh
27
+ list with replaced first message and last tool is returned, so callers
28
+ that share the underlying ``ContextManager.items`` list don't see their
29
+ persisted history rewritten.
30
+ """
31
+ if not model_name or not model_name.startswith("anthropic/"):
32
+ return messages, tools
33
+
34
+ if tools:
35
+ new_tools = list(tools)
36
+ last = dict(new_tools[-1])
37
+ last["cache_control"] = {"type": "ephemeral"}
38
+ new_tools[-1] = last
39
+ tools = new_tools
40
+
41
+ if messages:
42
+ first = messages[0]
43
+ role = first.get("role") if isinstance(first, dict) else getattr(first, "role", None)
44
+ if role == "system":
45
+ content = (
46
+ first.get("content")
47
+ if isinstance(first, dict)
48
+ else getattr(first, "content", None)
49
+ )
50
+ if isinstance(content, str) and content:
51
+ cached_block = [{
52
+ "type": "text",
53
+ "text": content,
54
+ "cache_control": {"type": "ephemeral"},
55
+ }]
56
+ new_first = {"role": "system", "content": cached_block}
57
+ messages = [new_first] + list(messages[1:])
58
+
59
+ return messages, tools
agent/core/session.py CHANGED
@@ -109,6 +109,16 @@ class Session:
109
  self.turn_count: int = 0
110
  self.last_auto_save_turn: int = 0
111
 
 
 
 
 
 
 
 
 
 
 
112
  async def send_event(self, event: Event) -> None:
113
  """Send event back to client and log to trajectory"""
114
  await self.event_queue.put(event)
@@ -139,6 +149,19 @@ class Session:
139
  self.config.model_name = model_name
140
  self.context_manager.model_max_tokens = _get_max_tokens_safe(model_name)
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  def increment_turn(self) -> None:
143
  """Increment turn counter (called after each user interaction)"""
144
  self.turn_count += 1
 
109
  self.turn_count: int = 0
110
  self.last_auto_save_turn: int = 0
111
 
112
+ # Per-model probed reasoning-effort cache. Populated by the probe
113
+ # on /model switch, read by ``effective_effort_for`` below. Keys are
114
+ # raw model ids (including any ``:tag``). Values:
115
+ # str → the effort level to send (may be a downgrade from the
116
+ # preference, e.g. "high" when user asked for "max")
117
+ # None → model rejected all efforts in the cascade; send no
118
+ # thinking params at all
119
+ # Key absent → not probed yet; fall back to the raw preference.
120
+ self.model_effective_effort: dict[str, str | None] = {}
121
+
122
  async def send_event(self, event: Event) -> None:
123
  """Send event back to client and log to trajectory"""
124
  await self.event_queue.put(event)
 
149
  self.config.model_name = model_name
150
  self.context_manager.model_max_tokens = _get_max_tokens_safe(model_name)
151
 
152
+ def effective_effort_for(self, model_name: str) -> str | None:
153
+ """Resolve the effort level to actually send for ``model_name``.
154
+
155
+ Returns the probed result when we have one (may be ``None`` meaning
156
+ "model doesn't do thinking, strip it"), else the raw preference.
157
+ Unknown-model case falls back to the preference so a stale cache
158
+ from a prior ``/model`` can't poison research sub-calls that use a
159
+ different model id.
160
+ """
161
+ if model_name in self.model_effective_effort:
162
+ return self.model_effective_effort[model_name]
163
+ return self.config.reasoning_effort
164
+
165
  def increment_turn(self) -> None:
166
  """Increment turn counter (called after each user interaction)"""
167
  self.turn_count += 1
agent/main.py CHANGED
@@ -22,6 +22,7 @@ from prompt_toolkit import PromptSession
22
 
23
  from agent.config import load_config
24
  from agent.core.agent_loop import submission_loop
 
25
  from agent.core.session import OpType
26
  from agent.core.tools import ToolRouter
27
  from agent.utils.reliability_checks import check_training_script_save_pattern
@@ -49,39 +50,6 @@ litellm.drop_params = True
49
  # on every error — users don't need it, and our friendly errors cover the case.
50
  litellm.suppress_debug_info = True
51
 
52
- # ── Suggested models shown by `/model` (not a gate) ──────────────────────
53
- # Users can paste any HF model id (e.g. "MiniMaxAI/MiniMax-M2.7") or use one
54
- # of the `anthropic/` / `openai/` prefixes for direct API access. For HF ids,
55
- # append ":fastest" / ":cheapest" / ":preferred" / ":<provider>" to override
56
- # the default routing policy (auto = fastest with failover).
57
- SUGGESTED_MODELS = [
58
- {"id": "anthropic/claude-opus-4-6", "label": "Claude Opus 4.6"},
59
- {"id": "MiniMaxAI/MiniMax-M2.7", "label": "MiniMax M2.7"},
60
- {"id": "moonshotai/Kimi-K2.6", "label": "Kimi K2.6"},
61
- {"id": "zai-org/GLM-5.1", "label": "GLM 5.1"},
62
- ]
63
-
64
-
65
- def _is_valid_model_id(model_id: str) -> bool:
66
- """Loose format check — lets users pick any model id.
67
-
68
- Accepts:
69
- • anthropic/<model>
70
- • openai/<model>
71
- • <org>/<model>[:<tag>] (HF router; tag = provider or policy)
72
- • huggingface/<org>/<model>[:<tag>] (same, accepts legacy prefix)
73
-
74
- Actual availability is verified against the HF router catalog on switch,
75
- or by the provider on first call.
76
- """
77
- if not model_id or "/" not in model_id:
78
- return False
79
- # Strip :tag suffix before structural check
80
- head = model_id.split(":", 1)[0]
81
- parts = head.split("/")
82
- return len(parts) >= 2 and all(parts)
83
-
84
-
85
  def _safe_get_args(arguments: dict) -> dict:
86
  """Safely extract args dict from arguments, handling cases where LLM passes string."""
87
  args = arguments.get("args", {})
@@ -91,80 +59,6 @@ def _safe_get_args(arguments: dict) -> dict:
91
  return args if isinstance(args, dict) else {}
92
 
93
 
94
- _ROUTING_POLICIES = {"fastest", "cheapest", "preferred"}
95
-
96
-
97
- def _print_model_preflight(model_id: str, console) -> None:
98
- """Validate a model switch against the HF router catalog and show the
99
- user what they're about to use (providers, price, context, tool support).
100
-
101
- Anthropic/OpenAI ids skip the catalog — those are direct API calls.
102
- For unknown HF ids we print a red warning with fuzzy suggestions but
103
- still allow the switch (the catalog might be lagging).
104
- """
105
- if model_id.startswith(("anthropic/", "openai/")):
106
- console.print(f"[green]Model switched to {model_id}[/green]")
107
- return
108
-
109
- from agent.core import hf_router_catalog as cat
110
-
111
- bare, _, tag = model_id.partition(":")
112
- info = cat.lookup(bare)
113
- if info is None:
114
- console.print(
115
- f"[bold red]Warning:[/bold red] '{bare}' isn't in the HF router "
116
- "catalog. Switching anyway — first call may fail."
117
- )
118
- suggestions = cat.fuzzy_suggest(bare)
119
- if suggestions:
120
- console.print(f"[dim]Did you mean: {', '.join(suggestions)}[/dim]")
121
- return
122
-
123
- live = info.live_providers
124
- if not live:
125
- console.print(
126
- f"[bold red]Warning:[/bold red] '{bare}' has no live providers "
127
- "right now. First call will likely fail."
128
- )
129
- return
130
-
131
- if tag and tag not in _ROUTING_POLICIES:
132
- matched = [p for p in live if p.provider == tag]
133
- if not matched:
134
- names = ", ".join(p.provider for p in live)
135
- console.print(
136
- f"[bold red]Warning:[/bold red] provider '{tag}' doesn't serve "
137
- f"'{bare}'. Live providers: {names}. Switching anyway."
138
- )
139
- return
140
-
141
- if not info.any_supports_tools:
142
- console.print(
143
- f"[bold red]Warning:[/bold red] no provider for '{bare}' advertises "
144
- "tool-call support. This agent relies on tool calls — expect errors."
145
- )
146
-
147
- console.print(f"[green]Model switched to {model_id}[/green]")
148
- if tag in _ROUTING_POLICIES:
149
- policy = tag
150
- elif tag:
151
- policy = f"pinned to {tag}"
152
- else:
153
- policy = "auto (fastest)"
154
- console.print(f" [dim]routing: {policy}[/dim]")
155
- for p in live:
156
- price = (
157
- f"${p.input_price:g}/${p.output_price:g} per M tok"
158
- if p.input_price is not None and p.output_price is not None
159
- else "price n/a"
160
- )
161
- ctx = f"{p.context_length:,} ctx" if p.context_length else "ctx n/a"
162
- tools = "tools" if p.supports_tools else "no tools"
163
- console.print(
164
- f" [dim]{p.provider}: {price}, {ctx}, {tools}[/dim]"
165
- )
166
-
167
-
168
  def _get_hf_token() -> str | None:
169
  """Get HF token from environment, huggingface_hub API, or cached token file."""
170
  token = os.environ.get("HF_TOKEN")
@@ -807,7 +701,7 @@ async def get_user_input(prompt_session: PromptSession) -> str:
807
  # Slash commands are defined in terminal_display
808
 
809
 
810
- def _handle_slash_command(
811
  cmd: str,
812
  config,
813
  session_holder: list,
@@ -817,6 +711,9 @@ def _handle_slash_command(
817
  """
818
  Handle a slash command. Returns a Submission to enqueue, or None if
819
  the command was handled locally (caller should set turn_complete_event).
 
 
 
820
  """
821
  parts = cmd.strip().split(None, 1)
822
  command = parts[0].lower()
@@ -843,35 +740,16 @@ def _handle_slash_command(
843
  if command == "/model":
844
  console = get_console()
845
  if not arg:
846
- current = config.model_name if config else ""
847
- console.print("[bold]Current model:[/bold]")
848
- console.print(f" {current}")
849
- console.print("\n[bold]Suggested:[/bold]")
850
- for m in SUGGESTED_MODELS:
851
- marker = " [dim]<-- current[/dim]" if m["id"] == current else ""
852
- console.print(f" {m['id']} [dim]({m['label']})[/dim]{marker}")
853
- console.print(
854
- "\n[dim]Paste any HF model id (e.g. 'MiniMaxAI/MiniMax-M2.7').\n"
855
- "Add ':fastest', ':cheapest', ':preferred', or ':<provider>' to override routing.\n"
856
- "Use 'anthropic/<model>' or 'openai/<model>' for direct API access.[/dim]"
857
- )
858
  return None
859
- if not _is_valid_model_id(arg):
860
- console.print(f"[bold red]Invalid model id format:[/bold red] {arg}")
861
- console.print(
862
- "[dim]Expected:\n"
863
- " • <org>/<model>[:tag] (HF router — paste from huggingface.co)\n"
864
- " • anthropic/<model>\n"
865
- " • openai/<model>[/dim]"
866
- )
867
  return None
868
  normalized = arg.removeprefix("huggingface/")
869
- _print_model_preflight(normalized, console)
870
  session = session_holder[0] if session_holder else None
871
- if session:
872
- session.update_model(normalized)
873
- else:
874
- config.model_name = normalized
875
  return None
876
 
877
  if command == "/yolo":
@@ -882,14 +760,19 @@ def _handle_slash_command(
882
 
883
  if command == "/effort":
884
  console = get_console()
885
- valid = {"minimal", "low", "medium", "high", "off"}
 
886
  if not arg:
887
  current = config.reasoning_effort or "off"
888
- console.print(f"[bold]Reasoning effort:[/bold] {current}")
 
 
 
 
889
  console.print(
890
- "[dim]Set with '/effort minimal|low|medium|high|off'. "
891
- "Applies to models that support it (GPT-5 / o-series, Claude "
892
- "extended thinking, HF reasoning models); dropped otherwise.[/dim]"
893
  )
894
  return None
895
  level = arg.lower()
@@ -898,7 +781,16 @@ def _handle_slash_command(
898
  console.print(f"[dim]Expected one of: {', '.join(sorted(valid))}[/dim]")
899
  return None
900
  config.reasoning_effort = None if level == "off" else level
 
 
 
 
901
  console.print(f"[green]Reasoning effort: {level}[/green]")
 
 
 
 
 
902
  return None
903
 
904
  if command == "/status":
@@ -1083,7 +975,7 @@ async def main():
1083
 
1084
  # Handle slash commands
1085
  if user_input.strip().startswith("/"):
1086
- sub = _handle_slash_command(
1087
  user_input.strip(), config, session_holder, submission_queue, submission_id
1088
  )
1089
  if sub is None:
 
22
 
23
  from agent.config import load_config
24
  from agent.core.agent_loop import submission_loop
25
+ from agent.core import model_switcher
26
  from agent.core.session import OpType
27
  from agent.core.tools import ToolRouter
28
  from agent.utils.reliability_checks import check_training_script_save_pattern
 
50
  # on every error — users don't need it, and our friendly errors cover the case.
51
  litellm.suppress_debug_info = True
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def _safe_get_args(arguments: dict) -> dict:
54
  """Safely extract args dict from arguments, handling cases where LLM passes string."""
55
  args = arguments.get("args", {})
 
59
  return args if isinstance(args, dict) else {}
60
 
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  def _get_hf_token() -> str | None:
63
  """Get HF token from environment, huggingface_hub API, or cached token file."""
64
  token = os.environ.get("HF_TOKEN")
 
701
  # Slash commands are defined in terminal_display
702
 
703
 
704
+ async def _handle_slash_command(
705
  cmd: str,
706
  config,
707
  session_holder: list,
 
711
  """
712
  Handle a slash command. Returns a Submission to enqueue, or None if
713
  the command was handled locally (caller should set turn_complete_event).
714
+
715
+ Async because ``/model`` fires a probe ping to validate the model+effort
716
+ combo before committing the switch.
717
  """
718
  parts = cmd.strip().split(None, 1)
719
  command = parts[0].lower()
 
740
  if command == "/model":
741
  console = get_console()
742
  if not arg:
743
+ model_switcher.print_model_listing(config, console)
 
 
 
 
 
 
 
 
 
 
 
744
  return None
745
+ if not model_switcher.is_valid_model_id(arg):
746
+ model_switcher.print_invalid_id(arg, console)
 
 
 
 
 
 
747
  return None
748
  normalized = arg.removeprefix("huggingface/")
 
749
  session = session_holder[0] if session_holder else None
750
+ await model_switcher.probe_and_switch_model(
751
+ normalized, config, session, console, _get_hf_token(),
752
+ )
 
753
  return None
754
 
755
  if command == "/yolo":
 
760
 
761
  if command == "/effort":
762
  console = get_console()
763
+ valid = {"minimal", "low", "medium", "high", "xhigh", "max", "off"}
764
+ session = session_holder[0] if session_holder else None
765
  if not arg:
766
  current = config.reasoning_effort or "off"
767
+ console.print(f"[bold]Reasoning effort preference:[/bold] {current}")
768
+ if session and session.model_effective_effort:
769
+ console.print("[dim]Probed per model:[/dim]")
770
+ for m, eff in session.model_effective_effort.items():
771
+ console.print(f" [dim]{m}: {eff or 'off'}[/dim]")
772
  console.print(
773
+ "[dim]Set with '/effort minimal|low|medium|high|xhigh|max|off'. "
774
+ "'max' and 'xhigh' are Anthropic-only; the cascade falls back "
775
+ "to whatever the model actually accepts.[/dim]"
776
  )
777
  return None
778
  level = arg.lower()
 
781
  console.print(f"[dim]Expected one of: {', '.join(sorted(valid))}[/dim]")
782
  return None
783
  config.reasoning_effort = None if level == "off" else level
784
+ # Drop the per-model probe cache — the new preference may resolve
785
+ # differently. Next ``/model`` (or the retry safety net) reprobes.
786
+ if session is not None:
787
+ session.model_effective_effort.clear()
788
  console.print(f"[green]Reasoning effort: {level}[/green]")
789
+ if session is not None:
790
+ console.print(
791
+ "[dim]run /model <current> to re-probe, or send a message — "
792
+ "the agent adjusts automatically if the new level isn't supported.[/dim]"
793
+ )
794
  return None
795
 
796
  if command == "/status":
 
975
 
976
  # Handle slash commands
977
  if user_input.strip().startswith("/"):
978
+ sub = await _handle_slash_command(
979
  user_input.strip(), config, session_holder, submission_queue, submission_id
980
  )
981
  if sub is None:
agent/tools/research_tool.py CHANGED
@@ -15,6 +15,7 @@ from litellm import Message, acompletion
15
 
16
  from agent.core.doom_loop import check_for_doom_loop
17
  from agent.core.llm_params import _resolve_llm_params
 
18
  from agent.core.session import Event
19
 
20
  logger = logging.getLogger(__name__)
@@ -246,10 +247,16 @@ async def research_handler(
246
  # Use a cheaper/faster model for research
247
  main_model = session.config.model_name
248
  research_model = _get_research_model(main_model)
 
 
 
 
 
 
249
  llm_params = _resolve_llm_params(
250
  research_model,
251
  getattr(session, "hf_token", None),
252
- reasoning_effort=getattr(session.config, "reasoning_effort", None),
253
  )
254
 
255
  # Get read-only tool specs from the session's tool router
@@ -317,8 +324,9 @@ async def research_handler(
317
  ),
318
  ))
319
  try:
 
320
  response = await acompletion(
321
- messages=messages,
322
  tools=None, # no tools — force text response
323
  stream=False,
324
  timeout=120,
@@ -342,9 +350,12 @@ async def research_handler(
342
  ))
343
 
344
  try:
 
 
 
345
  response = await acompletion(
346
- messages=messages,
347
- tools=tool_specs if tool_specs else None,
348
  tool_choice="auto",
349
  stream=False,
350
  timeout=120,
@@ -440,8 +451,9 @@ async def research_handler(
440
  ),
441
  ))
442
  try:
 
443
  response = await acompletion(
444
- messages=messages,
445
  tools=None,
446
  stream=False,
447
  timeout=120,
 
15
 
16
  from agent.core.doom_loop import check_for_doom_loop
17
  from agent.core.llm_params import _resolve_llm_params
18
+ from agent.core.prompt_caching import with_prompt_caching
19
  from agent.core.session import Event
20
 
21
  logger = logging.getLogger(__name__)
 
247
  # Use a cheaper/faster model for research
248
  main_model = session.config.model_name
249
  research_model = _get_research_model(main_model)
250
+ # Research is a cheap sub-call — cap the main session's effort at "high"
251
+ # so a user preference of ``max``/``xhigh`` (valid for Opus 4.6/4.7) doesn't
252
+ # propagate to a Sonnet research model that may not accept those levels.
253
+ # We also haven't probed this sub-model so we don't know its ceiling.
254
+ _pref = getattr(session.config, "reasoning_effort", None)
255
+ _capped = "high" if _pref in ("max", "xhigh") else _pref
256
  llm_params = _resolve_llm_params(
257
  research_model,
258
  getattr(session, "hf_token", None),
259
+ reasoning_effort=_capped,
260
  )
261
 
262
  # Get read-only tool specs from the session's tool router
 
324
  ),
325
  ))
326
  try:
327
+ _msgs, _ = with_prompt_caching(messages, None, llm_params.get("model"))
328
  response = await acompletion(
329
+ messages=_msgs,
330
  tools=None, # no tools — force text response
331
  stream=False,
332
  timeout=120,
 
350
  ))
351
 
352
  try:
353
+ _msgs, _tools = with_prompt_caching(
354
+ messages, tool_specs if tool_specs else None, llm_params.get("model")
355
+ )
356
  response = await acompletion(
357
+ messages=_msgs,
358
+ tools=_tools,
359
  tool_choice="auto",
360
  stream=False,
361
  timeout=120,
 
451
  ),
452
  ))
453
  try:
454
+ _msgs, _ = with_prompt_caching(messages, None, llm_params.get("model"))
455
  response = await acompletion(
456
+ messages=_msgs,
457
  tools=None,
458
  stream=False,
459
  timeout=120,
agent/utils/terminal_display.py CHANGED
@@ -440,7 +440,7 @@ HELP_TEXT = f"""\
440
  {_I} [cyan]/undo[/cyan] Undo last turn
441
  {_I} [cyan]/compact[/cyan] Compact context window
442
  {_I} [cyan]/model[/cyan] [id] Show available models or switch
443
- {_I} [cyan]/effort[/cyan] [level] Reasoning effort (minimal|low|medium|high|off)
444
  {_I} [cyan]/yolo[/cyan] Toggle auto-approve mode
445
  {_I} [cyan]/status[/cyan] Current model & turn count
446
  {_I} [cyan]/quit[/cyan] Exit"""
 
440
  {_I} [cyan]/undo[/cyan] Undo last turn
441
  {_I} [cyan]/compact[/cyan] Compact context window
442
  {_I} [cyan]/model[/cyan] [id] Show available models or switch
443
+ {_I} [cyan]/effort[/cyan] [level] Reasoning effort (minimal|low|medium|high|xhigh|max|off)
444
  {_I} [cyan]/yolo[/cyan] Toggle auto-approve mode
445
  {_I} [cyan]/status[/cyan] Current model & turn count
446
  {_I} [cyan]/quit[/cyan] Exit"""
backend/dependencies.py CHANGED
@@ -16,6 +16,7 @@ logger = logging.getLogger(__name__)
16
 
17
  OPENID_PROVIDER_URL = os.environ.get("OPENID_PROVIDER_URL", "https://huggingface.co")
18
  AUTH_ENABLED = bool(os.environ.get("OAUTH_CLIENT_ID", ""))
 
19
 
20
  # Simple in-memory token cache: token -> (user_info, expiry_time)
21
  _token_cache: dict[str, tuple[dict[str, Any], float]] = {}
@@ -28,8 +29,13 @@ DEV_USER: dict[str, Any] = {
28
  "user_id": "dev",
29
  "username": "dev",
30
  "authenticated": True,
 
31
  }
32
 
 
 
 
 
33
 
34
  async def _validate_token(token: str) -> dict[str, Any] | None:
35
  """Validate a token against HF OAuth userinfo endpoint.
@@ -74,12 +80,86 @@ def _user_from_info(user_info: dict[str, Any]) -> dict[str, Any]:
74
  }
75
 
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  async def _extract_user_from_token(token: str) -> dict[str, Any] | None:
78
  """Validate a token and return a user dict, or None."""
79
  user_info = await _validate_token(token)
80
- if user_info:
81
- return _user_from_info(user_info)
82
- return None
 
 
83
 
84
 
85
  async def check_org_membership(token: str, org_name: str) -> bool:
@@ -141,3 +221,29 @@ async def get_current_user(request: Request) -> dict[str, Any]:
141
  )
142
 
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  OPENID_PROVIDER_URL = os.environ.get("OPENID_PROVIDER_URL", "https://huggingface.co")
18
  AUTH_ENABLED = bool(os.environ.get("OAUTH_CLIENT_ID", ""))
19
+ HF_EMPLOYEE_ORG = os.environ.get("HF_EMPLOYEE_ORG", "huggingface")
20
 
21
  # Simple in-memory token cache: token -> (user_info, expiry_time)
22
  _token_cache: dict[str, tuple[dict[str, Any], float]] = {}
 
29
  "user_id": "dev",
30
  "username": "dev",
31
  "authenticated": True,
32
+ "plan": "org", # Dev runs at the Pro/Org quota tier so local testing isn't capped.
33
  }
34
 
35
+ # Plan field discovery — log the whoami-v2 shape once at DEBUG so we can
36
+ # confirm the actual key in production without hammering the HF API.
37
+ _WHOAMI_SHAPE_LOGGED = False
38
+
39
 
40
  async def _validate_token(token: str) -> dict[str, Any] | None:
41
  """Validate a token against HF OAuth userinfo endpoint.
 
80
  }
81
 
82
 
83
+ def _normalize_plan(whoami: dict[str, Any]) -> str:
84
+ """Map an HF /api/whoami-v2 payload to one of: 'free' | 'pro' | 'org'.
85
+
86
+ The exact field shape in whoami-v2 isn't documented for our purposes,
87
+ so we try a handful of likely keys and fall back to 'free'. The first
88
+ call logs the raw shape at DEBUG (see `_fetch_user_plan`) so we can
89
+ pin the real key post-deploy.
90
+ """
91
+ plan_str = ""
92
+ for key in ("plan", "type", "accountType"):
93
+ val = whoami.get(key)
94
+ if isinstance(val, str) and val:
95
+ plan_str = val.lower()
96
+ break
97
+
98
+ if not plan_str:
99
+ if whoami.get("isPro") is True or whoami.get("is_pro") is True:
100
+ return "pro"
101
+
102
+ if "pro" in plan_str or "enterprise" in plan_str or "team" in plan_str:
103
+ return "pro"
104
+
105
+ # Org tier: anyone in a paid / enterprise org. We don't pay for this
106
+ # right now, but the "pro" cap applies identically.
107
+ orgs = whoami.get("orgs") or []
108
+ if isinstance(orgs, list):
109
+ for org in orgs:
110
+ if isinstance(org, dict):
111
+ org_plan = str(org.get("plan") or org.get("type") or "").lower()
112
+ if "pro" in org_plan or "enterprise" in org_plan or "team" in org_plan:
113
+ return "org"
114
+
115
+ return "free"
116
+
117
+
118
async def _fetch_user_plan(token: str) -> str:
    """Resolve the caller's HF plan tier by querying /api/whoami-v2.

    Any failure mode -- a non-200 status, an ``httpx.HTTPError``, a body that
    fails JSON decoding (``ValueError``), or a decoded payload that is not a
    dict -- yields 'free'. Under-granting the Pro cap is the safe default.

    The first successfully decoded payload has its key set and a few sample
    values logged once at DEBUG (guarded by ``_WHOAMI_SHAPE_LOGGED``).

    Args:
        token: Bearer token sent in the ``Authorization`` header.

    Returns:
        'free' | 'pro' | 'org'.
    """
    global _WHOAMI_SHAPE_LOGGED

    endpoint = f"{OPENID_PROVIDER_URL}/api/whoami-v2"
    auth_headers = {"Authorization": f"Bearer {token}"}

    async with httpx.AsyncClient(timeout=5.0) as client:
        try:
            resp = await client.get(endpoint, headers=auth_headers)
            if resp.status_code != 200:
                return "free"
            payload = resp.json()
        except (httpx.HTTPError, ValueError):
            # Transport errors and undecodable bodies both collapse to 'free'.
            return "free"

    payload_is_dict = isinstance(payload, dict)

    if not _WHOAMI_SHAPE_LOGGED:
        # Flip the flag before logging so the shape is reported at most once.
        _WHOAMI_SHAPE_LOGGED = True
        if payload_is_dict:
            shape = sorted(payload.keys())
            samples = (payload.get("plan"), payload.get("type"), payload.get("isPro"))
        else:
            shape = type(payload).__name__
            samples = (None, None, None)
        logger.debug(
            "whoami-v2 payload keys: %s (sample values: plan=%r type=%r isPro=%r)",
            shape,
            *samples,
        )

    if payload_is_dict:
        return _normalize_plan(payload)
    return "free"
153
+
154
+
155
  async def _extract_user_from_token(token: str) -> dict[str, Any] | None:
156
  """Validate a token and return a user dict, or None."""
157
  user_info = await _validate_token(token)
158
+ if user_info is None:
159
+ return None
160
+ user = _user_from_info(user_info)
161
+ user["plan"] = await _fetch_user_plan(token)
162
+ return user
163
 
164
 
165
  async def check_org_membership(token: str, org_name: str) -> bool:
 
221
  )
222
 
223
 
224
+ def _extract_token(request: Request) -> str | None:
225
+ """Pull the HF access token from the Authorization header or cookie.
226
+
227
+ Mirrors the lookup order used by ``get_current_user``.
228
+ """
229
+ auth_header = request.headers.get("Authorization", "")
230
+ if auth_header.startswith("Bearer "):
231
+ return auth_header[7:]
232
+ return request.cookies.get("hf_access_token")
233
+
234
+
235
async def require_huggingface_org_member(request: Request) -> bool:
    """Report whether the caller belongs to the ``HF_EMPLOYEE_ORG`` org.

    Used to gate endpoints that can push a session onto an Anthropic model
    billed to the Space's ``ANTHROPIC_API_KEY``.

    Returns:
        ``True`` unconditionally when ``AUTH_ENABLED`` is false (dev mode, so
        local testing isn't blocked); ``False`` when the request carries no
        token; otherwise the result of ``check_org_membership`` for that
        token.
    """
    if AUTH_ENABLED:
        token = _extract_token(request)
        if token:
            return await check_org_membership(token, HF_EMPLOYEE_ORG)
        # Missing or empty token: cannot prove membership.
        return False
    return True
248
+
249
+
backend/routes/agent.py CHANGED
@@ -10,7 +10,7 @@ import logging
10
  import os
11
  from typing import Any
12
 
13
- from dependencies import get_current_user
14
  from fastapi import (
15
  APIRouter,
16
  Depends,
@@ -28,7 +28,9 @@ from models import (
28
  SubmitRequest,
29
  TruncateRequest,
30
  )
31
- from session_manager import MAX_SESSIONS, SessionCapacityError, session_manager
 
 
32
 
33
  from agent.core.llm_params import _resolve_llm_params
34
 
@@ -37,31 +39,99 @@ logger = logging.getLogger(__name__)
37
  router = APIRouter(prefix="/api", tags=["agent"])
38
 
39
  AVAILABLE_MODELS = [
 
 
 
 
 
 
 
40
  {
41
  "id": "anthropic/claude-opus-4-6",
42
  "label": "Claude Opus 4.6",
43
  "provider": "anthropic",
 
44
  "recommended": True,
45
  },
46
  {
47
  "id": "MiniMaxAI/MiniMax-M2.7",
48
  "label": "MiniMax M2.7",
49
  "provider": "huggingface",
50
- "recommended": True,
51
- },
52
- {
53
- "id": "moonshotai/Kimi-K2.6",
54
- "label": "Kimi K2.6",
55
- "provider": "huggingface",
56
  },
57
  {
58
  "id": "zai-org/GLM-5.1",
59
  "label": "GLM 5.1",
60
  "provider": "huggingface",
 
61
  },
62
  ]
63
 
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  def _check_session_access(session_id: str, user: dict[str, Any]) -> None:
66
  """Verify the user has access to the given session. Raises 403 or 404."""
67
  info = session_manager.get_session_info(session_id)
@@ -143,20 +213,6 @@ async def get_model() -> dict:
143
  }
144
 
145
 
146
- @router.post("/config/model")
147
- async def set_model(body: dict, user: dict = Depends(get_current_user)) -> dict:
148
- """Set the LLM model. Applies to new conversations."""
149
- model_id = body.get("model")
150
- if not model_id:
151
- raise HTTPException(status_code=400, detail="Missing 'model' field")
152
- valid_ids = {m["id"] for m in AVAILABLE_MODELS}
153
- if model_id not in valid_ids:
154
- raise HTTPException(status_code=400, detail=f"Unknown model: {model_id}")
155
- session_manager.config.model_name = model_id
156
- logger.info(f"Model changed to {model_id} by {user.get('username', 'unknown')}")
157
- return {"model": model_id}
158
-
159
-
160
  _TITLE_STRIP_CHARS = str.maketrans("", "", "`*_~#[]()")
161
 
162
 
@@ -224,6 +280,10 @@ async def create_session(
224
  and stored in the session so that tools (e.g. hf_jobs) can act on
225
  behalf of the user.
226
 
 
 
 
 
227
  Returns 503 if the server or user has reached the session limit.
228
  """
229
  # Extract the user's HF token (Bearer header, HttpOnly cookie, or env var)
@@ -236,9 +296,27 @@ async def create_session(
236
  if not hf_token:
237
  hf_token = os.environ.get("HF_TOKEN")
238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  try:
240
  session_id = await session_manager.create_session(
241
- user_id=user["user_id"], hf_token=hf_token
242
  )
243
  except SessionCapacityError as e:
244
  raise HTTPException(status_code=503, detail=str(e))
@@ -254,6 +332,9 @@ async def restore_session_summary(
254
  conversation. The client sends its cached messages; we run the standard
255
  summarization prompt on them and drop the result into the new
256
  session's context as a user-role system note.
 
 
 
257
  """
258
  messages = body.get("messages")
259
  if not isinstance(messages, list) or not messages:
@@ -268,9 +349,17 @@ async def restore_session_summary(
268
  if not hf_token:
269
  hf_token = os.environ.get("HF_TOKEN")
270
 
 
 
 
 
 
 
 
 
271
  try:
272
  session_id = await session_manager.create_session(
273
- user_id=user["user_id"], hf_token=hf_token
274
  )
275
  except SessionCapacityError as e:
276
  raise HTTPException(status_code=503, detail=str(e))
@@ -302,12 +391,19 @@ async def get_session(
302
 
303
  @router.post("/session/{session_id}/model")
304
  async def set_session_model(
305
- session_id: str, body: dict, user: dict = Depends(get_current_user)
 
 
 
306
  ) -> dict:
307
  """Switch the active model for a single session (tab-scoped).
308
 
309
  Takes effect on the next LLM call in that session — other sessions
310
- (including other browser tabs) are unaffected.
 
 
 
 
311
  """
312
  _check_session_access(session_id, user)
313
  model_id = body.get("model")
@@ -316,6 +412,7 @@ async def set_session_model(
316
  valid_ids = {m["id"] for m in AVAILABLE_MODELS}
317
  if model_id not in valid_ids:
318
  raise HTTPException(status_code=400, detail=f"Unknown model: {model_id}")
 
319
  agent_session = session_manager.sessions.get(session_id)
320
  if not agent_session:
321
  raise HTTPException(status_code=404, detail="Session not found")
@@ -327,6 +424,20 @@ async def set_session_model(
327
  return {"session_id": session_id, "model": model_id}
328
 
329
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
  @router.get("/sessions", response_model=list[SessionInfo])
331
  async def list_sessions(user: dict = Depends(get_current_user)) -> list[SessionInfo]:
332
  """List sessions belonging to the authenticated user."""
@@ -352,6 +463,9 @@ async def submit_input(
352
  ) -> dict:
353
  """Submit user input to a session. Only accessible by the session owner."""
354
  _check_session_access(request.session_id, user)
 
 
 
355
  success = await session_manager.submit_user_input(request.session_id, request.text)
356
  if not success:
357
  raise HTTPException(status_code=404, detail="Session not found or inactive")
@@ -404,6 +518,16 @@ async def chat_sse(
404
  text = body.get("text")
405
  approvals = body.get("approvals")
406
 
 
 
 
 
 
 
 
 
 
 
407
  try:
408
  if approvals:
409
  formatted = [
 
10
  import os
11
  from typing import Any
12
 
13
+ from dependencies import get_current_user, require_huggingface_org_member
14
  from fastapi import (
15
  APIRouter,
16
  Depends,
 
28
  SubmitRequest,
29
  TruncateRequest,
30
  )
31
+ from session_manager import MAX_SESSIONS, AgentSession, SessionCapacityError, session_manager
32
+
33
+ import user_quotas
34
 
35
  from agent.core.llm_params import _resolve_llm_params
36
 
 
39
  router = APIRouter(prefix="/api", tags=["agent"])
40
 
41
  AVAILABLE_MODELS = [
42
+ {
43
+ "id": "moonshotai/Kimi-K2.6",
44
+ "label": "Kimi K2.6",
45
+ "provider": "huggingface",
46
+ "tier": "free",
47
+ "recommended": True,
48
+ },
49
  {
50
  "id": "anthropic/claude-opus-4-6",
51
  "label": "Claude Opus 4.6",
52
  "provider": "anthropic",
53
+ "tier": "pro",
54
  "recommended": True,
55
  },
56
  {
57
  "id": "MiniMaxAI/MiniMax-M2.7",
58
  "label": "MiniMax M2.7",
59
  "provider": "huggingface",
60
+ "tier": "free",
 
 
 
 
 
61
  },
62
  {
63
  "id": "zai-org/GLM-5.1",
64
  "label": "GLM 5.1",
65
  "provider": "huggingface",
66
+ "tier": "free",
67
  },
68
  ]
69
 
70
 
71
async def _require_hf_for_anthropic(request: Request, model_id: str) -> None:
    """Reject Anthropic model selection by callers outside the HF org.

    Anthropic models are billed to the Space's ``ANTHROPIC_API_KEY``; every
    other model in ``AVAILABLE_MODELS`` is routed through HF Router and
    billed via ``X-HF-Bill-To``. Only ids starting with ``anthropic/`` are
    gated, so everyone can still switch freely between the free models.

    Pattern: https://github.com/huggingface/ml-intern/pull/63

    Raises:
        HTTPException: 403 with an ``anthropic_restricted`` detail when the
            model is Anthropic and ``require_huggingface_org_member`` returns
            a falsy value.
    """
    is_anthropic = model_id.startswith("anthropic/")
    # Short-circuit keeps the membership lookup off the free-model path.
    if is_anthropic and not await require_huggingface_org_member(request):
        denial = {
            "error": "anthropic_restricted",
            "message": (
                "Opus is gated to HF staff. Pick a free model — "
                "Kimi K2.6, MiniMax M2.7, or GLM 5.1 — instead."
            ),
        }
        raise HTTPException(status_code=403, detail=denial)
94
+
95
+
96
async def _enforce_claude_quota(
    user: dict[str, Any],
    agent_session: AgentSession,
) -> None:
    """Charge one unit of the user's daily Claude quota for this session.

    Called at *message-submit* time rather than session-create time, so
    opening a Claude session without sending anything costs nothing. Once a
    session has been charged, ``agent_session.claude_counted`` is set and
    later calls return immediately.

    Does nothing when the session was already charged or when its current
    model id does not start with ``anthropic/``.

    Args:
        user: Authenticated user dict; reads ``user_id`` and optional ``plan``.
        agent_session: The session about to receive a message.

    Raises:
        HTTPException: 429 with a ``claude_daily_cap`` detail when today's
            usage has already reached the cap for the user's plan.
    """
    if agent_session.claude_counted:
        return
    if not agent_session.session.config.model_name.startswith("anthropic/"):
        return

    uid = user["user_id"]
    spent_today = await user_quotas.get_claude_used_today(uid)
    limit = user_quotas.daily_cap_for(user.get("plan"))

    if spent_today < limit:
        # Under the cap: record the charge and mark the session as counted.
        await user_quotas.increment_claude(uid)
        agent_session.claude_counted = True
        return

    raise HTTPException(
        status_code=429,
        detail={
            "error": "claude_daily_cap",
            "plan": user.get("plan", "free"),
            "cap": limit,
            "message": (
                "Daily Claude limit reached. Upgrade to HF Pro for "
                f"{user_quotas.CLAUDE_PRO_DAILY}/day or use a free model."
            ),
        },
    )
133
+
134
+
135
  def _check_session_access(session_id: str, user: dict[str, Any]) -> None:
136
  """Verify the user has access to the given session. Raises 403 or 404."""
137
  info = session_manager.get_session_info(session_id)
 
213
  }
214
 
215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  _TITLE_STRIP_CHARS = str.maketrans("", "", "`*_~#[]()")
217
 
218
 
 
280
  and stored in the session so that tools (e.g. hf_jobs) can act on
281
  behalf of the user.
282
 
283
+ Optional body ``{"model"?: <id>}`` selects the session's LLM; unknown
284
+ ids are rejected (400). The Claude-quota gate runs at message-submit
285
+ time, not here — spinning up an Opus session to look around is free.
286
+
287
  Returns 503 if the server or user has reached the session limit.
288
  """
289
  # Extract the user's HF token (Bearer header, HttpOnly cookie, or env var)
 
296
  if not hf_token:
297
  hf_token = os.environ.get("HF_TOKEN")
298
 
299
+ # Optional model override. Empty body falls back to the config default.
300
+ model: str | None = None
301
+ try:
302
+ body = await request.json()
303
+ except Exception:
304
+ body = None
305
+ if isinstance(body, dict):
306
+ model = body.get("model")
307
+
308
+ valid_ids = {m["id"] for m in AVAILABLE_MODELS}
309
+ if model and model not in valid_ids:
310
+ raise HTTPException(status_code=400, detail=f"Unknown model: {model}")
311
+
312
+ # Opus is gated to HF staff (PR #63). Only fires when the resolved model
313
+ # is Anthropic; free models pass through.
314
+ resolved_model = model or session_manager.config.model_name
315
+ await _require_hf_for_anthropic(request, resolved_model)
316
+
317
  try:
318
  session_id = await session_manager.create_session(
319
+ user_id=user["user_id"], hf_token=hf_token, model=model
320
  )
321
  except SessionCapacityError as e:
322
  raise HTTPException(status_code=503, detail=str(e))
 
332
  conversation. The client sends its cached messages; we run the standard
333
  summarization prompt on them and drop the result into the new
334
  session's context as a user-role system note.
335
+
336
+ Optional ``"model"`` in the body overrides the session's LLM. The
337
+ Claude-quota gate runs at message-submit time, not here.
338
  """
339
  messages = body.get("messages")
340
  if not isinstance(messages, list) or not messages:
 
349
  if not hf_token:
350
  hf_token = os.environ.get("HF_TOKEN")
351
 
352
+ model = body.get("model")
353
+ valid_ids = {m["id"] for m in AVAILABLE_MODELS}
354
+ if model and model not in valid_ids:
355
+ raise HTTPException(status_code=400, detail=f"Unknown model: {model}")
356
+
357
+ resolved_model = model or session_manager.config.model_name
358
+ await _require_hf_for_anthropic(request, resolved_model)
359
+
360
  try:
361
  session_id = await session_manager.create_session(
362
+ user_id=user["user_id"], hf_token=hf_token, model=model
363
  )
364
  except SessionCapacityError as e:
365
  raise HTTPException(status_code=503, detail=str(e))
 
391
 
392
  @router.post("/session/{session_id}/model")
393
  async def set_session_model(
394
+ session_id: str,
395
+ body: dict,
396
+ request: Request,
397
+ user: dict = Depends(get_current_user),
398
  ) -> dict:
399
  """Switch the active model for a single session (tab-scoped).
400
 
401
  Takes effect on the next LLM call in that session — other sessions
402
+ (including other browser tabs) are unaffected. Model switches don't
403
+ charge quota — the Claude-quota gate only fires at message-submit time.
404
+
405
+ Switching TO an Anthropic model requires HF org membership (PR #63);
406
+ free-model switches are unrestricted.
407
  """
408
  _check_session_access(session_id, user)
409
  model_id = body.get("model")
 
412
  valid_ids = {m["id"] for m in AVAILABLE_MODELS}
413
  if model_id not in valid_ids:
414
  raise HTTPException(status_code=400, detail=f"Unknown model: {model_id}")
415
+ await _require_hf_for_anthropic(request, model_id)
416
  agent_session = session_manager.sessions.get(session_id)
417
  if not agent_session:
418
  raise HTTPException(status_code=404, detail="Session not found")
 
424
  return {"session_id": session_id, "model": model_id}
425
 
426
 
427
@router.get("/user/quota")
async def get_user_quota(user: dict = Depends(get_current_user)) -> dict:
    """Report the caller's plan tier and today's Claude quota usage.

    Returns:
        A dict with ``plan``, ``claude_used_today``, ``claude_daily_cap`` and
        ``claude_remaining`` (never negative).
    """
    tier = user.get("plan", "free")
    spent_today = await user_quotas.get_claude_used_today(user["user_id"])
    limit = user_quotas.daily_cap_for(tier)
    # Clamp at zero so a cap lowered mid-day never yields a negative value.
    left = max(0, limit - spent_today)
    return {
        "plan": tier,
        "claude_used_today": spent_today,
        "claude_daily_cap": limit,
        "claude_remaining": left,
    }
439
+
440
+
441
  @router.get("/sessions", response_model=list[SessionInfo])
442
  async def list_sessions(user: dict = Depends(get_current_user)) -> list[SessionInfo]:
443
  """List sessions belonging to the authenticated user."""
 
463
  ) -> dict:
464
  """Submit user input to a session. Only accessible by the session owner."""
465
  _check_session_access(request.session_id, user)
466
+ agent_session = session_manager.sessions.get(request.session_id)
467
+ if agent_session is not None:
468
+ await _enforce_claude_quota(user, agent_session)
469
  success = await session_manager.submit_user_input(request.session_id, request.text)
470
  if not success:
471
  raise HTTPException(status_code=404, detail="Session not found or inactive")
 
518
  text = body.get("text")
519
  approvals = body.get("approvals")
520
 
521
+ # Gate user-message sends against the daily Claude quota. Approvals are
522
+ # continuations of an in-progress turn — the session was already charged
523
+ # on its first message, so we skip the gate there.
524
+ if text is not None and not approvals:
525
+ try:
526
+ await _enforce_claude_quota(user, agent_session)
527
+ except HTTPException:
528
+ broadcaster.unsubscribe(sub_id)
529
+ raise
530
+
531
  try:
532
  if approvals:
533
  formatted = [
backend/session_manager.py CHANGED
@@ -91,6 +91,10 @@ class AgentSession:
91
  is_active: bool = True
92
  is_processing: bool = False # True while a submission is being executed
93
  broadcaster: Any = None
 
 
 
 
94
 
95
 
96
  class SessionCapacityError(Exception):
@@ -126,7 +130,12 @@ class SessionManager:
126
  if s.user_id == user_id and s.is_active
127
  )
128
 
129
- async def create_session(self, user_id: str = "dev", hf_token: str | None = None) -> str:
 
 
 
 
 
130
  """Create a new agent session and return its ID.
131
 
132
  Session() and ToolRouter() constructors contain blocking I/O
@@ -135,6 +144,10 @@ class SessionManager:
135
 
136
  Args:
137
  user_id: The ID of the user who owns this session.
 
 
 
 
138
 
139
  Raises:
140
  SessionCapacityError: If the server or user has reached the
@@ -175,6 +188,8 @@ class SessionManager:
175
  # Deep-copy config so each session's model switches independently —
176
  # tab A picking GLM doesn't flip tab B off Claude.
177
  session_config = self.config.model_copy(deep=True)
 
 
178
  session = Session(
179
  event_queue, config=session_config, tool_router=tool_router,
180
  hf_token=hf_token,
 
91
  is_active: bool = True
92
  is_processing: bool = False # True while a submission is being executed
93
  broadcaster: Any = None
94
+ # True once this session has been counted against the user's daily
95
+ # Claude quota. Guards double-counting when the user re-selects an
96
+ # Anthropic model mid-session.
97
+ claude_counted: bool = False
98
 
99
 
100
  class SessionCapacityError(Exception):
 
130
  if s.user_id == user_id and s.is_active
131
  )
132
 
133
+ async def create_session(
134
+ self,
135
+ user_id: str = "dev",
136
+ hf_token: str | None = None,
137
+ model: str | None = None,
138
+ ) -> str:
139
  """Create a new agent session and return its ID.
140
 
141
  Session() and ToolRouter() constructors contain blocking I/O
 
144
 
145
  Args:
146
  user_id: The ID of the user who owns this session.
147
+ hf_token: The user's HF OAuth token, stored for tool execution.
148
+ model: Optional model override. When set, replaces ``model_name``
149
+ on the per-session config clone. None falls back to the
150
+ config default.
151
 
152
  Raises:
153
  SessionCapacityError: If the server or user has reached the
 
188
  # Deep-copy config so each session's model switches independently —
189
  # tab A picking GLM doesn't flip tab B off Claude.
190
  session_config = self.config.model_copy(deep=True)
191
+ if model:
192
+ session_config.model_name = model
193
  session = Session(
194
  event_queue, config=session_config, tool_router=tool_router,
195
  hf_token=hf_token,
backend/user_quotas.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """In-memory daily quota for Claude session creations.
2
+
3
+ Tracks per-user Claude session starts against a daily cap derived from the
4
+ user's HF plan. Caps reset at UTC midnight; the store itself is in-process
5
+ and wipes on restart (deliberate — the cost of occasional over-subsidy at
6
+ restart is much lower than running a DB).
7
+
8
+ Unit: sessions, not messages. A session consumes one quota point the first time
9
+ a message is sent on a Claude model; later messages in that same session are
10
+ not charged again (`AgentSession.claude_counted` guards that).
11
+
12
+ Cap tiers:
13
+ free user → CLAUDE_FREE_DAILY (1)
14
+ pro / org → CLAUDE_PRO_DAILY (20)
15
+ """
16
+
17
+ import asyncio
18
+ import os
19
+ from datetime import UTC, datetime
20
+
21
+ CLAUDE_FREE_DAILY: int = int(os.environ.get("CLAUDE_FREE_DAILY", "1"))
22
+ CLAUDE_PRO_DAILY: int = int(os.environ.get("CLAUDE_PRO_DAILY", "20"))
23
+
24
+ # user_id -> (day_utc_iso, count_for_that_day)
25
+ _claude_counts: dict[str, tuple[str, int]] = {}
26
+ _lock = asyncio.Lock()
27
+
28
+
29
+ def _today() -> str:
30
+ return datetime.now(UTC).date().isoformat()
31
+
32
+
33
def daily_cap_for(plan: str | None) -> int:
    """Return the daily Claude-session cap for the given plan.

    Only the known paid tiers (``"pro"`` and ``"org"``, compared
    case-insensitively and ignoring surrounding whitespace) receive
    ``CLAUDE_PRO_DAILY``. ``None``, ``"free"`` and any unrecognised value
    receive ``CLAUDE_FREE_DAILY``.

    The previous ``== "free"`` test failed open: a typo or unexpected casing
    such as ``"Free"`` was granted the Pro cap. This gate protects billed
    usage, so unknown input now gets the lower cap.
    """
    normalized = (plan or "").strip().lower()
    if normalized in ("pro", "org"):
        return CLAUDE_PRO_DAILY
    return CLAUDE_FREE_DAILY
36
+
37
+
38
async def get_claude_used_today(user_id: str) -> int:
    """Return how many Claude sessions *user_id* has been charged for today.

    Returns 0 when the user has no entry. An entry recorded for a different
    UTC day is removed and also reported as 0, so the next increment starts
    from a clean slate.
    """
    async with _lock:
        recorded_day, recorded_count = _claude_counts.get(user_id, (None, 0))
        if recorded_day is None:
            return 0
        if recorded_day == _today():
            return recorded_count
        # Entry is from a previous day: discard it.
        _claude_counts.pop(user_id, None)
        return 0
50
+
51
+
52
async def increment_claude(user_id: str) -> int:
    """Add one to today's Claude session count for *user_id*.

    A count stored for a different UTC day is treated as zero before the
    increment.

    Returns:
        The updated count for today.
    """
    async with _lock:
        today = _today()
        previous = _claude_counts.get(user_id)
        # Only carry the old count forward when it belongs to today.
        carried = previous[1] if previous is not None and previous[0] == today else 0
        updated = carried + 1
        _claude_counts[user_id] = (today, updated)
        return updated
62
+
63
+
64
async def refund_claude(user_id: str) -> None:
    """Give back one unit of today's Claude count for *user_id*.

    No-op when the user has no entry. An entry from a different UTC day is
    simply removed. The count never drops below zero, and an entry that
    reaches zero is deleted rather than stored.
    """
    async with _lock:
        entry = _claude_counts.get(user_id)
        if entry is None:
            return
        day, count = entry
        # A stale-day entry has nothing left to refund.
        remaining = max(0, count - 1) if day == _today() else 0
        if remaining:
            _claude_counts[user_id] = (day, remaining)
        else:
            _claude_counts.pop(user_id, None)
79
+
80
+
81
def _reset_for_tests() -> None:
    """Empty the in-memory quota store in place. Intended for tests only."""
    # Clear rather than rebind so existing references see the reset too.
    _claude_counts.clear()
frontend/src/components/Chat/ChatInput.tsx CHANGED
@@ -4,6 +4,10 @@ import ArrowUpwardIcon from '@mui/icons-material/ArrowUpward';
4
  import ArrowDropDownIcon from '@mui/icons-material/ArrowDropDown';
5
  import StopIcon from '@mui/icons-material/Stop';
6
  import { apiFetch } from '@/utils/api';
 
 
 
 
7
 
8
  // Model configuration
9
  interface ModelOption {
@@ -21,6 +25,14 @@ const getHfAvatarUrl = (modelId: string) => {
21
  };
22
 
23
  const MODEL_OPTIONS: ModelOption[] = [
 
 
 
 
 
 
 
 
24
  {
25
  id: 'claude-opus',
26
  name: 'Claude Opus 4.6',
@@ -35,14 +47,6 @@ const MODEL_OPTIONS: ModelOption[] = [
35
  description: 'Novita',
36
  modelPath: 'MiniMaxAI/MiniMax-M2.7',
37
  avatarUrl: getHfAvatarUrl('MiniMaxAI/MiniMax-M2.7'),
38
- recommended: true,
39
- },
40
- {
41
- id: 'kimi-k2.6',
42
- name: 'Kimi K2.6',
43
- description: 'Novita',
44
- modelPath: 'moonshotai/Kimi-K2.6',
45
- avatarUrl: getHfAvatarUrl('moonshotai/Kimi-K2.6'),
46
  },
47
  {
48
  id: 'glm-5.1',
@@ -66,11 +70,23 @@ interface ChatInputProps {
66
  placeholder?: string;
67
  }
68
 
 
 
 
69
  export default function ChatInput({ sessionId, onSend, onStop, isProcessing = false, disabled = false, placeholder = 'Ask anything...' }: ChatInputProps) {
70
  const [input, setInput] = useState('');
71
  const inputRef = useRef<HTMLTextAreaElement>(null);
72
  const [selectedModelId, setSelectedModelId] = useState<string>(MODEL_OPTIONS[0].id);
73
  const [modelAnchorEl, setModelAnchorEl] = useState<null | HTMLElement>(null);
 
 
 
 
 
 
 
 
 
74
 
75
  // Model is per-session: fetch this tab's current model every time the
76
  // session changes. Other tabs keep their own selections independently.
@@ -101,11 +117,27 @@ export default function ChatInput({ sessionId, onSend, onStop, isProcessing = fa
101
 
102
  const handleSend = useCallback(() => {
103
  if (input.trim() && !disabled) {
 
104
  onSend(input);
105
  setInput('');
106
  }
107
  }, [input, disabled, onSend]);
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  const handleKeyDown = useCallback(
110
  (e: KeyboardEvent<HTMLDivElement>) => {
111
  if (e.key === 'Enter' && !e.shiftKey) {
@@ -136,6 +168,45 @@ export default function ChatInput({ sessionId, onSend, onStop, isProcessing = fa
136
  } catch { /* ignore */ }
137
  };
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  return (
140
  <Box
141
  sx={{
@@ -334,6 +405,19 @@ export default function ChatInput({ sessionId, onSend, onStop, isProcessing = fa
334
  }}
335
  />
336
  )}
 
 
 
 
 
 
 
 
 
 
 
 
 
337
  </Box>
338
  }
339
  secondary={model.description}
@@ -344,6 +428,14 @@ export default function ChatInput({ sessionId, onSend, onStop, isProcessing = fa
344
  </MenuItem>
345
  ))}
346
  </Menu>
 
 
 
 
 
 
 
 
347
  </Box>
348
  </Box>
349
  );
 
4
  import ArrowDropDownIcon from '@mui/icons-material/ArrowDropDown';
5
  import StopIcon from '@mui/icons-material/Stop';
6
  import { apiFetch } from '@/utils/api';
7
+ import { useUserQuota } from '@/hooks/useUserQuota';
8
+ import ClaudeCapDialog from '@/components/ClaudeCapDialog';
9
+ import { useAgentStore } from '@/store/agentStore';
10
+ import { FIRST_FREE_MODEL_PATH } from '@/utils/model';
11
 
12
  // Model configuration
13
  interface ModelOption {
 
25
  };
26
 
27
  const MODEL_OPTIONS: ModelOption[] = [
28
+ {
29
+ id: 'kimi-k2.6',
30
+ name: 'Kimi K2.6',
31
+ description: 'Novita',
32
+ modelPath: 'moonshotai/Kimi-K2.6',
33
+ avatarUrl: getHfAvatarUrl('moonshotai/Kimi-K2.6'),
34
+ recommended: true,
35
+ },
36
  {
37
  id: 'claude-opus',
38
  name: 'Claude Opus 4.6',
 
47
  description: 'Novita',
48
  modelPath: 'MiniMaxAI/MiniMax-M2.7',
49
  avatarUrl: getHfAvatarUrl('MiniMaxAI/MiniMax-M2.7'),
 
 
 
 
 
 
 
 
50
  },
51
  {
52
  id: 'glm-5.1',
 
70
  placeholder?: string;
71
  }
72
 
73
+ const isClaudeModel = (m: ModelOption) => m.modelPath.startsWith('anthropic/');
74
+ const firstFreeModel = () => MODEL_OPTIONS.find(m => !isClaudeModel(m)) ?? MODEL_OPTIONS[0];
75
+
76
  export default function ChatInput({ sessionId, onSend, onStop, isProcessing = false, disabled = false, placeholder = 'Ask anything...' }: ChatInputProps) {
77
  const [input, setInput] = useState('');
78
  const inputRef = useRef<HTMLTextAreaElement>(null);
79
  const [selectedModelId, setSelectedModelId] = useState<string>(MODEL_OPTIONS[0].id);
80
  const [modelAnchorEl, setModelAnchorEl] = useState<null | HTMLElement>(null);
81
+ const { quota, refresh: refreshQuota } = useUserQuota();
82
+ // The daily-cap dialog is triggered from two places: (a) a 429 returned
83
+ // from the chat transport when the user tries to send on Opus over cap —
84
+ // surfaced via the agent-store flag — and (b) nothing else right now
85
+ // (switching models is free). Keeping the open state in the store means
86
+ // the hook layer can flip it without threading props through.
87
+ const claudeQuotaExhausted = useAgentStore((s) => s.claudeQuotaExhausted);
88
+ const setClaudeQuotaExhausted = useAgentStore((s) => s.setClaudeQuotaExhausted);
89
+ const lastSentRef = useRef<string>('');
90
 
91
  // Model is per-session: fetch this tab's current model every time the
92
  // session changes. Other tabs keep their own selections independently.
 
117
 
118
  const handleSend = useCallback(() => {
119
  if (input.trim() && !disabled) {
120
+ lastSentRef.current = input;
121
  onSend(input);
122
  setInput('');
123
  }
124
  }, [input, disabled, onSend]);
125
 
126
+ // When the chat transport reports a Claude-quota 429, restore the typed
127
+ // text so the user doesn't lose their message.
128
+ useEffect(() => {
129
+ if (claudeQuotaExhausted && lastSentRef.current) {
130
+ setInput(lastSentRef.current);
131
+ }
132
+ }, [claudeQuotaExhausted]);
133
+
134
+ // Refresh the quota display whenever the session changes (user might
135
+ // have started another tab that spent quota).
136
+ useEffect(() => {
137
+ if (sessionId) refreshQuota();
138
+ // eslint-disable-next-line react-hooks/exhaustive-deps
139
+ }, [sessionId]);
140
+
141
  const handleKeyDown = useCallback(
142
  (e: KeyboardEvent<HTMLDivElement>) => {
143
  if (e.key === 'Enter' && !e.shiftKey) {
 
168
  } catch { /* ignore */ }
169
  };
170
 
171
+ // Dialog close: just clear the flag. The typed text is already restored.
172
+ const handleCapDialogClose = useCallback(() => {
173
+ setClaudeQuotaExhausted(false);
174
+ }, [setClaudeQuotaExhausted]);
175
+
176
+ // "Use a free model" — switch the current session to Kimi (or the first
177
+ // non-Anthropic option) and auto-retry the send that tripped the cap.
178
+ const handleUseFreeModel = useCallback(async () => {
179
+ setClaudeQuotaExhausted(false);
180
+ if (!sessionId) return;
181
+ const free = MODEL_OPTIONS.find(m => m.modelPath === FIRST_FREE_MODEL_PATH)
182
+ ?? firstFreeModel();
183
+ try {
184
+ const res = await apiFetch(`/api/session/${sessionId}/model`, {
185
+ method: 'POST',
186
+ body: JSON.stringify({ model: free.modelPath }),
187
+ });
188
+ if (res.ok) {
189
+ setSelectedModelId(free.id);
190
+ const retryText = lastSentRef.current;
191
+ if (retryText) {
192
+ onSend(retryText);
193
+ setInput('');
194
+ lastSentRef.current = '';
195
+ }
196
+ }
197
+ } catch { /* ignore */ }
198
+ }, [sessionId, onSend, setClaudeQuotaExhausted]);
199
+
200
+ // Hide the chip until the user has actually burned quota — an unused
201
+ // Opus session shouldn't populate a counter.
202
+ const claudeChip = (() => {
203
+ if (!quota || quota.claudeUsedToday === 0) return null;
204
+ if (quota.plan === 'free') {
205
+ return quota.claudeRemaining > 0 ? 'Free today' : 'Pro only';
206
+ }
207
+ return `${quota.claudeUsedToday}/${quota.claudeDailyCap} today`;
208
+ })();
209
+
210
  return (
211
  <Box
212
  sx={{
 
405
  }}
406
  />
407
  )}
408
+ {isClaudeModel(model) && claudeChip && (
409
+ <Chip
410
+ label={claudeChip}
411
+ size="small"
412
+ sx={{
413
+ height: '18px',
414
+ fontSize: '10px',
415
+ bgcolor: 'rgba(255,255,255,0.08)',
416
+ color: 'var(--muted-text)',
417
+ fontWeight: 600,
418
+ }}
419
+ />
420
+ )}
421
  </Box>
422
  }
423
  secondary={model.description}
 
428
  </MenuItem>
429
  ))}
430
  </Menu>
431
+
432
+ <ClaudeCapDialog
433
+ open={claudeQuotaExhausted}
434
+ plan={quota?.plan ?? 'free'}
435
+ cap={quota?.claudeDailyCap ?? 1}
436
+ onClose={handleCapDialogClose}
437
+ onUseFreeModel={handleUseFreeModel}
438
+ />
439
  </Box>
440
  </Box>
441
  );
frontend/src/components/ClaudeCapDialog.tsx ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import {
2
+ Box,
3
+ Button,
4
+ Dialog,
5
+ DialogActions,
6
+ DialogContent,
7
+ DialogContentText,
8
+ DialogTitle,
9
+ Typography,
10
+ } from '@mui/material';
11
+ import type { PlanTier } from '@/hooks/useUserQuota';
12
+
13
+ const HF_PRICING_URL = 'https://huggingface.co/pricing';
14
+ const PRO_CAP = 20;
15
+
16
+ interface ClaudeCapDialogProps {
17
+ open: boolean;
18
+ plan: PlanTier;
19
+ cap: number;
20
+ onClose: () => void;
21
+ onUseFreeModel: () => void;
22
+ }
23
+
24
+ export default function ClaudeCapDialog({
25
+ open,
26
+ plan,
27
+ cap,
28
+ onClose,
29
+ onUseFreeModel,
30
+ }: ClaudeCapDialogProps) {
31
+ // plan not surfaced in copy right now — Pro users see the same dialog and
32
+ // can upgrade their org if they're also capped.
33
+ void plan;
34
+
35
+ return (
36
+ <Dialog
37
+ open={open}
38
+ onClose={onClose}
39
+ slotProps={{
40
+ backdrop: { sx: { backgroundColor: 'rgba(0,0,0,0.5)', backdropFilter: 'blur(4px)' } },
41
+ }}
42
+ PaperProps={{
43
+ sx: {
44
+ bgcolor: 'var(--panel)',
45
+ border: '1px solid var(--border)',
46
+ borderRadius: 'var(--radius-md)',
47
+ boxShadow: 'var(--shadow-1)',
48
+ maxWidth: 460,
49
+ mx: 2,
50
+ },
51
+ }}
52
+ >
53
+ <DialogTitle
54
+ sx={{ color: 'var(--text)', fontWeight: 700, fontSize: '1rem', pt: 2.5, pb: 0, px: 3 }}
55
+ >
56
+ You've hit your Opus limit
57
+ </DialogTitle>
58
+ <DialogContent sx={{ px: 3, pt: 1.25, pb: 0 }}>
59
+ <DialogContentText
60
+ sx={{ color: 'var(--muted-text)', fontSize: '0.85rem', lineHeight: 1.6 }}
61
+ >
62
+ Opus costs an arm and a leg, so we unfortunately have to cap you at {cap}{' '}
63
+ {cap === 1 ? 'session' : 'sessions'} a day. Give Kimi, MiniMax, or GLM a spin —
64
+ they are genuinely good and we use them all the time.
65
+ </DialogContentText>
66
+ <Box
67
+ sx={{
68
+ mt: 2,
69
+ p: 1.5,
70
+ borderRadius: '8px',
71
+ bgcolor: 'var(--accent-yellow-weak)',
72
+ border: '1px solid var(--border)',
73
+ }}
74
+ >
75
+ <Typography
76
+ variant="caption"
77
+ sx={{
78
+ display: 'block',
79
+ fontWeight: 700,
80
+ color: 'var(--text)',
81
+ fontSize: '0.78rem',
82
+ mb: 0.5,
83
+ letterSpacing: '0.02em',
84
+ }}
85
+ >
86
+ HF Pro ($9/mo) — more Opus, more everything
87
+ </Typography>
88
+ <Typography
89
+ variant="caption"
90
+ sx={{ display: 'block', color: 'var(--muted-text)', fontSize: '0.78rem', lineHeight: 1.55 }}
91
+ >
92
+ {PRO_CAP} Opus sessions/day here, 20× HF Inference credits, ZeroGPU access,
93
+ and priority on Spaces hardware.
94
+ </Typography>
95
+ </Box>
96
+ </DialogContent>
97
+ <DialogActions sx={{ px: 3, pb: 2.5, pt: 2, gap: 1 }}>
98
+ <Button
99
+ component="a"
100
+ href={HF_PRICING_URL}
101
+ target="_blank"
102
+ rel="noopener noreferrer"
103
+ variant="contained"
104
+ size="small"
105
+ sx={{
106
+ fontSize: '0.82rem',
107
+ px: 2.5,
108
+ bgcolor: 'var(--accent-yellow)',
109
+ color: '#000',
110
+ textTransform: 'none',
111
+ fontWeight: 700,
112
+ boxShadow: 'none',
113
+ '&:hover': { bgcolor: '#FFB340', boxShadow: 'none' },
114
+ }}
115
+ >
116
+ Upgrade to Pro
117
+ </Button>
118
+ <Button
119
+ onClick={onUseFreeModel}
120
+ size="small"
121
+ sx={{
122
+ color: 'var(--muted-text)',
123
+ fontSize: '0.82rem',
124
+ px: 2,
125
+ textTransform: 'none',
126
+ '&:hover': { bgcolor: 'var(--hover-bg)' },
127
+ }}
128
+ >
129
+ Use a free model
130
+ </Button>
131
+ </DialogActions>
132
+ </Dialog>
133
+ );
134
+ }
frontend/src/hooks/useAgentChat.ts CHANGED
@@ -345,8 +345,16 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
345
  // sendMessages on the transport.
346
  sendAutomaticallyWhen: lastAssistantMessageIsCompleteWithApprovalResponses,
347
  onError: (error) => {
348
- logger.error('useChat error:', error);
349
  updateSession(sessionId, { isProcessing: false });
 
 
 
 
 
 
 
 
 
350
  if (isActiveRef.current) {
351
  useAgentStore.getState().setError(error.message);
352
  }
 
345
  // sendMessages on the transport.
346
  sendAutomaticallyWhen: lastAssistantMessageIsCompleteWithApprovalResponses,
347
  onError: (error) => {
 
348
  updateSession(sessionId, { isProcessing: false });
349
+ // Claude daily-cap: open the cap dialog instead of the generic error
350
+ // banner. Transport marks the error with this sentinel.
351
+ if (error.message === 'CLAUDE_QUOTA_EXHAUSTED') {
352
+ if (isActiveRef.current) {
353
+ useAgentStore.getState().setClaudeQuotaExhausted(true);
354
+ }
355
+ return;
356
+ }
357
+ logger.error('useChat error:', error);
358
  if (isActiveRef.current) {
359
  useAgentStore.getState().setError(error.message);
360
  }
frontend/src/hooks/useUserQuota.ts ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Reads the current user's Claude daily quota + plan tier from the backend.
3
+ *
4
+ * Fetches once when the user becomes authenticated, and exposes a `refresh()`
5
+ * that callers invoke after a successful session-create / model-switch so the
6
+ * chip reflects the new count without a full page reload.
7
+ */
8
+ import { useCallback, useEffect, useState } from 'react';
9
+ import { useAgentStore } from '@/store/agentStore';
10
+ import { apiFetch } from '@/utils/api';
11
+
12
+ export type PlanTier = 'free' | 'pro' | 'org';
13
+
14
+ export interface UserQuota {
15
+ plan: PlanTier;
16
+ claudeUsedToday: number;
17
+ claudeDailyCap: number;
18
+ claudeRemaining: number;
19
+ }
20
+
21
+ export function useUserQuota() {
22
+ const user = useAgentStore((s) => s.user);
23
+ const [quota, setQuota] = useState<UserQuota | null>(null);
24
+ const [loading, setLoading] = useState(false);
25
+
26
+ const refresh = useCallback(async () => {
27
+ if (!user?.authenticated) return;
28
+ setLoading(true);
29
+ try {
30
+ const res = await apiFetch('/api/user/quota');
31
+ if (!res.ok) return;
32
+ const data = await res.json();
33
+ setQuota({
34
+ plan: (data.plan ?? 'free') as PlanTier,
35
+ claudeUsedToday: data.claude_used_today ?? 0,
36
+ claudeDailyCap: data.claude_daily_cap ?? 1,
37
+ claudeRemaining: data.claude_remaining ?? 0,
38
+ });
39
+ } catch {
40
+ /* backend unreachable — leave previous value */
41
+ } finally {
42
+ setLoading(false);
43
+ }
44
+ }, [user?.authenticated]);
45
+
46
+ useEffect(() => {
47
+ refresh();
48
+ }, [refresh]);
49
+
50
+ return { quota, loading, refresh };
51
+ }
frontend/src/lib/sse-chat-transport.ts CHANGED
@@ -356,6 +356,12 @@ export class SSEChatTransport implements ChatTransport<UIMessage> {
356
  // it can flag the session for the catch-up banner.
357
  this.sideChannel.onSessionDead(sessionId);
358
  }
 
 
 
 
 
 
359
  if (!response.ok) {
360
  const errorText = await response.text().catch(() => 'Request failed');
361
  throw new Error(`Chat request failed: ${response.status} ${errorText}`);
 
356
  // it can flag the session for the catch-up banner.
357
  this.sideChannel.onSessionDead(sessionId);
358
  }
359
+ if (response.status === 429) {
360
+ // Claude daily-quota gate tripped. This exact message is the detection marker
361
+ // for useAgentChat's onError handler, which surfaces the cap dialog
362
+ // instead of a generic error banner.
363
+ throw new Error('CLAUDE_QUOTA_EXHAUSTED');
364
+ }
365
  if (!response.ok) {
366
  const errorText = await response.text().catch(() => 'Request failed');
367
  throw new Error(`Chat request failed: ${response.status} ${errorText}`);
frontend/src/store/agentStore.ts CHANGED
@@ -108,6 +108,8 @@ interface AgentStore {
108
  user: User | null;
109
  error: string | null;
110
  llmHealthError: LLMHealthError | null;
 
 
111
 
112
  // Right panel (single-artifact pattern)
113
  panelData: PanelData | null;
@@ -153,6 +155,7 @@ interface AgentStore {
153
  setUser: (user: User | null) => void;
154
  setError: (error: string | null) => void;
155
  setLlmHealthError: (error: LLMHealthError | null) => void;
 
156
 
157
  setPanel: (data: PanelData, view?: PanelView, editable?: boolean) => void;
158
  setPanelView: (view: PanelView) => void;
@@ -247,6 +250,7 @@ export const useAgentStore = create<AgentStore>()((set, get) => ({
247
  user: null,
248
  error: null,
249
  llmHealthError: null,
 
250
 
251
  panelData: null,
252
  panelView: 'script',
@@ -358,6 +362,7 @@ export const useAgentStore = create<AgentStore>()((set, get) => ({
358
  setUser: (user) => set({ user }),
359
  setError: (error) => set({ error }),
360
  setLlmHealthError: (error) => set({ llmHealthError: error }),
 
361
 
362
  // ── Panel (single-artifact) ───────────────────────────────────────
363
  // Each setter also patches the active session's snapshot so that
 
108
  user: User | null;
109
  error: string | null;
110
  llmHealthError: LLMHealthError | null;
111
+ /** Set when a Claude-send hits the daily quota — ChatInput opens the cap dialog in response. */
112
+ claudeQuotaExhausted: boolean;
113
 
114
  // Right panel (single-artifact pattern)
115
  panelData: PanelData | null;
 
155
  setUser: (user: User | null) => void;
156
  setError: (error: string | null) => void;
157
  setLlmHealthError: (error: LLMHealthError | null) => void;
158
+ setClaudeQuotaExhausted: (exhausted: boolean) => void;
159
 
160
  setPanel: (data: PanelData, view?: PanelView, editable?: boolean) => void;
161
  setPanelView: (view: PanelView) => void;
 
250
  user: null,
251
  error: null,
252
  llmHealthError: null,
253
+ claudeQuotaExhausted: false,
254
 
255
  panelData: null,
256
  panelView: 'script',
 
362
  setUser: (user) => set({ user }),
363
  setError: (error) => set({ error }),
364
  setLlmHealthError: (error) => set({ llmHealthError: error }),
365
+ setClaudeQuotaExhausted: (exhausted) => set({ claudeQuotaExhausted: exhausted }),
366
 
367
  // ── Panel (single-artifact) ───────────────────────────────────────
368
  // Each setter also patches the active session's snapshot so that
frontend/src/utils/model.ts ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Shared model-id constants used by session-create call sites and the
3
+ * ClaudeCapDialog "Use a free model" escape hatch.
4
+ *
5
+ * Keep in sync with MODEL_OPTIONS in components/Chat/ChatInput.tsx and
6
+ * AVAILABLE_MODELS in backend/routes/agent.py. Bare HF ids (no
7
+ * `huggingface/` prefix) — matches upstream's auto-router.
8
+ */
9
+
10
+ export const CLAUDE_MODEL_PATH = 'anthropic/claude-opus-4-6';
11
+ export const FIRST_FREE_MODEL_PATH = 'moonshotai/Kimi-K2.6';
12
+
13
+ export function isClaudePath(modelPath: string | undefined): boolean {
14
+ return !!modelPath && modelPath.startsWith('anthropic/');
15
+ }
tests/unit/test_user_quotas.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for backend/user_quotas.py — the in-memory Claude daily-quota store."""
2
+
3
+ import asyncio
4
+ import os
5
+ import sys
6
+ from pathlib import Path
7
+ from unittest.mock import patch
8
+
9
+ import pytest
10
+
11
+ # The backend package isn't on sys.path by default; add it so we can import
12
+ # the module under test without pulling in the whole FastAPI app.
13
+ _BACKEND_DIR = Path(__file__).resolve().parent.parent.parent / "backend"
14
+ if str(_BACKEND_DIR) not in sys.path:
15
+ sys.path.insert(0, str(_BACKEND_DIR))
16
+
17
+ import user_quotas # noqa: E402
18
+
19
+
20
+ @pytest.fixture(autouse=True)
21
+ def _reset_store():
22
+ """Fresh in-memory store per test."""
23
+ user_quotas._reset_for_tests()
24
+ yield
25
+ user_quotas._reset_for_tests()
26
+
27
+
28
+ def test_daily_cap_for_known_plans():
29
+ assert user_quotas.daily_cap_for("free") == user_quotas.CLAUDE_FREE_DAILY
30
+ assert user_quotas.daily_cap_for("pro") == user_quotas.CLAUDE_PRO_DAILY
31
+ assert user_quotas.daily_cap_for("org") == user_quotas.CLAUDE_PRO_DAILY
32
+
33
+
34
+ def test_daily_cap_for_unknown_or_missing_defaults_to_free():
35
+ assert user_quotas.daily_cap_for(None) == user_quotas.CLAUDE_FREE_DAILY
36
+ assert user_quotas.daily_cap_for("") == user_quotas.CLAUDE_FREE_DAILY
37
+ # A non-empty plan string we don't recognize gets the Pro cap, because the
38
+ # function's contract is that "free" (or a missing plan) is the only downgraded tier. If that
39
+ # ever flips, this test will flip too — adjust consciously.
40
+ assert user_quotas.daily_cap_for("mystery") == user_quotas.CLAUDE_PRO_DAILY
41
+
42
+
43
+ @pytest.mark.asyncio
44
+ async def test_increment_and_read_back_same_day():
45
+ assert await user_quotas.get_claude_used_today("u1") == 0
46
+ assert await user_quotas.increment_claude("u1") == 1
47
+ assert await user_quotas.increment_claude("u1") == 2
48
+ assert await user_quotas.get_claude_used_today("u1") == 2
49
+
50
+
51
+ @pytest.mark.asyncio
52
+ async def test_independent_users_do_not_share_counts():
53
+ await user_quotas.increment_claude("alice")
54
+ await user_quotas.increment_claude("alice")
55
+ await user_quotas.increment_claude("bob")
56
+ assert await user_quotas.get_claude_used_today("alice") == 2
57
+ assert await user_quotas.get_claude_used_today("bob") == 1
58
+
59
+
60
+ @pytest.mark.asyncio
61
+ async def test_stale_day_resets_before_next_read():
62
+ await user_quotas.increment_claude("u1")
63
+ # Simulate yesterday's entry still in the store.
64
+ user_quotas._claude_counts["u1"] = ("2000-01-01", 99)
65
+ assert await user_quotas.get_claude_used_today("u1") == 0
66
+ # And a fresh increment starts from 0.
67
+ assert await user_quotas.increment_claude("u1") == 1
68
+
69
+
70
+ @pytest.mark.asyncio
71
+ async def test_concurrent_increments_under_lock_do_not_lose_writes():
72
+ """50 coroutines bumping the same user must land at exactly 50."""
73
+ await asyncio.gather(*[user_quotas.increment_claude("race") for _ in range(50)])
74
+ assert await user_quotas.get_claude_used_today("race") == 50
75
+
76
+
77
+ @pytest.mark.asyncio
78
+ async def test_refund_decrements_and_drops_entry_at_zero():
79
+ await user_quotas.increment_claude("u1")
80
+ assert await user_quotas.get_claude_used_today("u1") == 1
81
+ await user_quotas.refund_claude("u1")
82
+ assert await user_quotas.get_claude_used_today("u1") == 0
83
+ assert "u1" not in user_quotas._claude_counts
84
+
85
+
86
+ @pytest.mark.asyncio
87
+ async def test_refund_on_nonexistent_user_is_noop():
88
+ await user_quotas.refund_claude("ghost") # should not raise
89
+ assert await user_quotas.get_claude_used_today("ghost") == 0
90
+
91
+
92
+ @pytest.mark.asyncio
93
+ async def test_refund_on_stale_day_resets_rather_than_underflow():
94
+ user_quotas._claude_counts["u1"] = ("2000-01-01", 5)
95
+ await user_quotas.refund_claude("u1")
96
+ # Stale entry dropped; today's count stays 0.
97
+ assert await user_quotas.get_claude_used_today("u1") == 0
98
+
99
+
100
+ @pytest.mark.asyncio
101
+ async def test_free_user_cap_reached_at_one():
102
+ cap = user_quotas.daily_cap_for("free")
103
+ used = await user_quotas.increment_claude("freebie")
104
+ assert used == 1
105
+ assert used >= cap # first bump exhausts the free tier (cap=1)
106
+
107
+
108
+ @pytest.mark.asyncio
109
+ async def test_pro_user_cap_reached_at_twenty():
110
+ cap = user_quotas.daily_cap_for("pro")
111
+ assert cap == 20
112
+ for i in range(1, 21):
113
+ assert await user_quotas.increment_claude("pro_user") == i
114
+ # 21st would exceed — the gate in routes/agent.py enforces this; here
115
+ # we just confirm the counter tracks past the cap so that check works.
116
+ assert await user_quotas.increment_claude("pro_user") == 21