akseljoonas HF Staff commited on
Commit
d0d08fc
·
2 Parent(s): 471f346ff8c636

Deploy 2026-04-26

Browse files
Files changed (42) hide show
  1. .github/workflows/claude-review.yml +13 -2
  2. README.md +5 -2
  3. agent/context_manager/manager.py +36 -32
  4. agent/core/agent_loop.py +140 -6
  5. agent/core/doom_loop.py +24 -4
  6. agent/core/effort_probe.py +8 -4
  7. agent/core/hf_access.py +181 -0
  8. agent/core/llm_params.py +2 -2
  9. agent/core/model_switcher.py +5 -2
  10. agent/core/telemetry.py +38 -0
  11. agent/main.py +19 -9
  12. agent/tools/jobs_tool.py +20 -2
  13. agent/tools/research_tool.py +3 -1
  14. agent/tools/sandbox_tool.py +11 -1
  15. agent/utils/terminal_display.py +2 -20
  16. backend/dependencies.py +6 -50
  17. backend/models.py +1 -0
  18. backend/routes/agent.py +154 -2
  19. backend/session_manager.py +1 -1
  20. configs/cli_agent_config.json +14 -0
  21. configs/{main_agent_config.json → frontend_agent_config.json} +0 -0
  22. frontend/src/components/Chat/ChatInput.tsx +58 -4
  23. frontend/src/components/ClaudeCapDialog.tsx +3 -0
  24. frontend/src/components/JobsUpgradeDialog.tsx +191 -0
  25. frontend/src/components/SessionChat.tsx +3 -1
  26. frontend/src/hooks/useAgentChat.ts +130 -1
  27. frontend/src/lib/sse-chat-transport.ts +26 -0
  28. frontend/src/store/agentStore.ts +37 -0
  29. frontend/src/types/agent.ts +1 -0
  30. frontend/src/utils/model.ts +3 -4
  31. pyproject.toml +6 -2
  32. scripts/build_kpis.py +24 -2
  33. tests/unit/test_build_kpis.py +32 -0
  34. tests/unit/test_cli_rendering.py +44 -0
  35. tests/unit/test_dangling_tool_calls.py +121 -0
  36. tests/unit/test_doom_loop_polling.py +96 -0
  37. tests/unit/test_hf_access.py +39 -0
  38. tests/unit/test_llm_error_classification.py +100 -0
  39. tests/unit/test_llm_params.py +25 -0
  40. tests/unit/test_malformed_args_recovery.py +66 -0
  41. tests/unit/test_sandbox_already_active_message.py +47 -0
  42. uv.lock +114 -72
.github/workflows/claude-review.yml CHANGED
@@ -1,8 +1,8 @@
1
  name: Claude PR Review
2
 
3
  on:
4
- pull_request:
5
- types: [opened, synchronize, ready_for_review]
6
 
7
  permissions:
8
  contents: read
@@ -22,6 +22,10 @@ jobs:
22
  - uses: actions/checkout@v4
23
  with:
24
  fetch-depth: 0
 
 
 
 
25
 
26
  - name: Compose review prompt
27
  id: compose
@@ -58,5 +62,12 @@ jobs:
58
  - uses: anthropics/claude-code-action@v1
59
  with:
60
  anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
 
 
 
 
 
 
 
61
  track_progress: true
62
  prompt: ${{ steps.compose.outputs.prompt }}
 
1
  name: Claude PR Review
2
 
3
  on:
4
+ pull_request_target:
5
+ types: [opened, synchronize, ready_for_review, reopened]
6
 
7
  permissions:
8
  contents: read
 
22
  - uses: actions/checkout@v4
23
  with:
24
  fetch-depth: 0
25
+ # On pull_request_target, keep checkout on the trusted base-repo ref.
26
+ # The Claude action can review the PR via GitHub context/API without
27
+ # executing untrusted fork code with repository secrets.
28
+ persist-credentials: false
29
 
30
  - name: Compose review prompt
31
  id: compose
 
62
  - uses: anthropics/claude-code-action@v1
63
  with:
64
  anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
65
+ # Bypass the OIDC -> Claude GitHub App token exchange. That exchange
66
+ # rejects OIDC tokens minted for pull_request_target events with
67
+ # "401 Invalid OIDC token", which broke every review after the switch
68
+ # away from pull_request. Using the workflow's GITHUB_TOKEN works for
69
+ # both same-repo and fork PRs; comments post as github-actions[bot]
70
+ # instead of claude[bot], which is the documented trade-off.
71
+ github_token: ${{ secrets.GITHUB_TOKEN }}
72
  track_progress: true
73
  prompt: ${{ steps.compose.outputs.prompt }}
README.md CHANGED
@@ -23,7 +23,7 @@ hf_oauth_scopes:
23
 
24
  # ML Intern
25
 
26
- An ML intern that autonomously researches, writes, and ships good quality ML releated code using the Hugging Face ecosystem — with deep access to docs, papers, datasets, and cloud compute.
27
 
28
  ## Quick Start
29
 
@@ -46,6 +46,7 @@ Create a `.env` file in the project root (or export these in your shell):
46
 
47
  ```bash
48
  ANTHROPIC_API_KEY=<your-anthropic-api-key> # if using anthropic models
 
49
  HF_TOKEN=<your-hugging-face-token>
50
  GITHUB_TOKEN=<github-personal-access-token>
51
  ```
@@ -69,6 +70,7 @@ ml-intern "fine-tune llama on my dataset"
69
 
70
  ```bash
71
  ml-intern --model anthropic/claude-opus-4-6 "your prompt"
 
72
  ml-intern --max-iterations 100 "your prompt"
73
  ml-intern --no-stream "your prompt"
74
  ```
@@ -229,7 +231,8 @@ def create_builtin_tools() -> list[ToolSpec]:
229
 
230
  ### Adding MCP Servers
231
 
232
- Edit `configs/main_agent_config.json`:
 
233
 
234
  ```json
235
  {
 
23
 
24
  # ML Intern
25
 
26
+ An ML intern that autonomously researches, writes, and ships good quality ML related code using the Hugging Face ecosystem — with deep access to docs, papers, datasets, and cloud compute.
27
 
28
  ## Quick Start
29
 
 
46
 
47
  ```bash
48
  ANTHROPIC_API_KEY=<your-anthropic-api-key> # if using anthropic models
49
+ OPENAI_API_KEY=<your-openai-api-key> # if using openai models
50
  HF_TOKEN=<your-hugging-face-token>
51
  GITHUB_TOKEN=<github-personal-access-token>
52
  ```
 
70
 
71
  ```bash
72
  ml-intern --model anthropic/claude-opus-4-6 "your prompt"
73
+ ml-intern --model openai/gpt-5.5 "your prompt"
74
  ml-intern --max-iterations 100 "your prompt"
75
  ml-intern --no-stream "your prompt"
76
  ```
 
231
 
232
  ### Adding MCP Servers
233
 
234
+ Edit `configs/cli_agent_config.json` for CLI defaults, or
235
+ `configs/frontend_agent_config.json` for web-session defaults:
236
 
237
  ```json
238
  {
agent/context_manager/manager.py CHANGED
@@ -253,45 +253,49 @@ class ContextManager:
253
  def _patch_dangling_tool_calls(self) -> None:
254
  """Add stub tool results for any tool_calls that lack a matching result.
255
 
256
- Scans backwards to find the last assistant message with tool_calls,
257
- which may not be items[-1] if some tool results were already added.
 
 
258
  """
259
  if not self.items:
260
  return
261
 
262
- # Find the last assistant message with tool_calls
263
- assistant_msg = None
264
- for i in range(len(self.items) - 1, -1, -1):
265
  msg = self.items[i]
266
- if getattr(msg, "role", None) == "assistant" and getattr(
267
- msg, "tool_calls", None
268
- ):
269
- assistant_msg = msg
270
- break
271
- # Stop scanning once we hit a user message — anything before
272
- # that belongs to a previous (complete) turn.
273
- if getattr(msg, "role", None) == "user":
274
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
 
276
- if not assistant_msg:
277
- return
 
278
 
279
- self._normalize_tool_calls(assistant_msg)
280
- answered_ids = {
281
- getattr(m, "tool_call_id", None)
282
- for m in self.items
283
- if getattr(m, "role", None) == "tool"
284
- }
285
- for tc in assistant_msg.tool_calls:
286
- if tc.id not in answered_ids:
287
- self.items.append(
288
- Message(
289
- role="tool",
290
- content="Tool was not executed (interrupted or error).",
291
- tool_call_id=tc.id,
292
- name=tc.function.name,
293
- )
294
- )
295
 
296
  def undo_last_turn(self) -> bool:
297
  """Remove the last complete turn (user msg + all assistant/tool msgs that follow).
 
253
  def _patch_dangling_tool_calls(self) -> None:
254
  """Add stub tool results for any tool_calls that lack a matching result.
255
 
256
+ Ensures each assistant message's tool_calls are followed immediately
257
+ by matching tool-result messages. This has to work across the whole
258
+ history, not just the most recent turn, because a cancelled tool use
259
+ in an earlier turn can still poison the next provider request.
260
  """
261
  if not self.items:
262
  return
263
 
264
+ i = 0
265
+ while i < len(self.items):
 
266
  msg = self.items[i]
267
+ if getattr(msg, "role", None) != "assistant" or not getattr(msg, "tool_calls", None):
268
+ i += 1
269
+ continue
270
+
271
+ self._normalize_tool_calls(msg)
272
+
273
+ # Consume the contiguous tool-result block that immediately follows
274
+ # this assistant message. Any missing tool ids must be inserted
275
+ # before the next non-tool message to satisfy provider ordering.
276
+ j = i + 1
277
+ immediate_ids: set[str | None] = set()
278
+ while j < len(self.items) and getattr(self.items[j], "role", None) == "tool":
279
+ immediate_ids.add(getattr(self.items[j], "tool_call_id", None))
280
+ j += 1
281
+
282
+ missing: list[Message] = []
283
+ for tc in msg.tool_calls:
284
+ if tc.id not in immediate_ids:
285
+ missing.append(
286
+ Message(
287
+ role="tool",
288
+ content="Tool was not executed (interrupted or error).",
289
+ tool_call_id=tc.id,
290
+ name=tc.function.name,
291
+ )
292
+ )
293
 
294
+ if missing:
295
+ self.items[j:j] = missing
296
+ j += len(missing)
297
 
298
+ i = j
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
 
300
  def undo_last_turn(self) -> bool:
301
  """Remove the last complete turn (user msg + all assistant/tool msgs that follow).
agent/core/agent_loop.py CHANGED
@@ -25,6 +25,61 @@ logger = logging.getLogger(__name__)
25
 
26
  ToolCall = ChatCompletionMessageToolCall
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  def _validate_tool_args(tool_args: dict) -> tuple[bool, str | None]:
30
  """
@@ -121,6 +176,54 @@ def _needs_approval(
121
  # -- LLM retry constants --------------------------------------------------
122
  _MAX_LLM_RETRIES = 3
123
  _LLM_RETRY_DELAYS = [5, 15, 30] # seconds between retries
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
 
126
  def _is_transient_error(error: Exception) -> bool:
@@ -128,7 +231,6 @@ def _is_transient_error(error: Exception) -> bool:
128
  err_str = str(error).lower()
129
  transient_patterns = [
130
  "timeout", "timed out",
131
- "429", "rate limit", "rate_limit",
132
  "503", "service unavailable",
133
  "502", "bad gateway",
134
  "500", "internal server error",
@@ -136,7 +238,7 @@ def _is_transient_error(error: Exception) -> bool:
136
  "connection reset", "connection refused", "connection error",
137
  "eof", "broken pipe",
138
  ]
139
- return any(pattern in err_str for pattern in transient_patterns)
140
 
141
 
142
  def _is_effort_config_error(error: Exception) -> bool:
@@ -317,6 +419,8 @@ async def _call_llm_streaming(session: Session, messages, tools, llm_params) ->
317
  except ContextWindowExceededError:
318
  raise
319
  except Exception as e:
 
 
320
  if not _healed_effort and _is_effort_config_error(e):
321
  _healed_effort = True
322
  llm_params = await _heal_effort_and_rebuild_params(session, e, llm_params)
@@ -325,8 +429,8 @@ async def _call_llm_streaming(session: Session, messages, tools, llm_params) ->
325
  data={"tool": "system", "log": "Reasoning effort not supported for this model — adjusting and retrying."},
326
  ))
327
  continue
328
- if _llm_attempt < _MAX_LLM_RETRIES - 1 and _is_transient_error(e):
329
- _delay = _LLM_RETRY_DELAYS[_llm_attempt]
330
  logger.warning(
331
  "Transient LLM error (attempt %d/%d): %s — retrying in %ds",
332
  _llm_attempt + 1, _MAX_LLM_RETRIES, e, _delay,
@@ -424,6 +528,8 @@ async def _call_llm_non_streaming(session: Session, messages, tools, llm_params)
424
  except ContextWindowExceededError:
425
  raise
426
  except Exception as e:
 
 
427
  if not _healed_effort and _is_effort_config_error(e):
428
  _healed_effort = True
429
  llm_params = await _heal_effort_and_rebuild_params(session, e, llm_params)
@@ -432,8 +538,8 @@ async def _call_llm_non_streaming(session: Session, messages, tools, llm_params)
432
  data={"tool": "system", "log": "Reasoning effort not supported for this model — adjusting and retrying."},
433
  ))
434
  continue
435
- if _llm_attempt < _MAX_LLM_RETRIES - 1 and _is_transient_error(e):
436
- _delay = _LLM_RETRY_DELAYS[_llm_attempt]
437
  logger.warning(
438
  "Transient LLM error (attempt %d/%d): %s — retrying in %ds",
439
  _llm_attempt + 1, _MAX_LLM_RETRIES, e, _delay,
@@ -585,6 +691,31 @@ class Handlers:
585
  )
586
  )
587
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
588
  messages = session.context_manager.get_messages()
589
  tools = session.tool_router.get_tool_specs_for_llm()
590
  try:
@@ -1006,6 +1137,9 @@ class Handlers:
1006
  tool_args["script"] = edited_script
1007
  was_edited = True
1008
  logger.info(f"Using user-edited script for {tool_name} ({tc.id})")
 
 
 
1009
  approved_tasks.append((tc, tool_name, tool_args, was_edited))
1010
  else:
1011
  rejected_tasks.append((tc, tool_name, approval_decision))
 
25
 
26
  ToolCall = ChatCompletionMessageToolCall
27
 
28
+ _MALFORMED_TOOL_PREFIX = "ERROR: Tool call to '"
29
+ _MALFORMED_TOOL_SUFFIX = "' had malformed JSON arguments"
30
+
31
+
32
+ def _malformed_tool_name(message: Message) -> str | None:
33
+ """Return the tool name for malformed-json tool-result messages."""
34
+ if getattr(message, "role", None) != "tool":
35
+ return None
36
+ content = getattr(message, "content", None)
37
+ if not isinstance(content, str):
38
+ return None
39
+ if not content.startswith(_MALFORMED_TOOL_PREFIX):
40
+ return None
41
+ end = content.find(_MALFORMED_TOOL_SUFFIX, len(_MALFORMED_TOOL_PREFIX))
42
+ if end == -1:
43
+ return None
44
+ return content[len(_MALFORMED_TOOL_PREFIX):end]
45
+
46
+
47
+ def _detect_repeated_malformed(
48
+ items: list[Message], threshold: int = 2,
49
+ ) -> str | None:
50
+ """Return the repeated malformed tool name if the tail contains a streak.
51
+
52
+ Walk backward over the current conversation tail. A streak counts only
53
+ consecutive malformed tool-result messages for the same tool; any other
54
+ tool result breaks it.
55
+ """
56
+ if threshold <= 0:
57
+ return None
58
+
59
+ streak_tool: str | None = None
60
+ streak = 0
61
+
62
+ for item in reversed(items):
63
+ if getattr(item, "role", None) != "tool":
64
+ continue
65
+
66
+ malformed_tool = _malformed_tool_name(item)
67
+ if malformed_tool is None:
68
+ break
69
+
70
+ if streak_tool is None:
71
+ streak_tool = malformed_tool
72
+ streak = 1
73
+ elif malformed_tool == streak_tool:
74
+ streak += 1
75
+ else:
76
+ break
77
+
78
+ if streak >= threshold:
79
+ return streak_tool
80
+
81
+ return None
82
+
83
 
84
  def _validate_tool_args(tool_args: dict) -> tuple[bool, str | None]:
85
  """
 
176
  # -- LLM retry constants --------------------------------------------------
177
  _MAX_LLM_RETRIES = 3
178
  _LLM_RETRY_DELAYS = [5, 15, 30] # seconds between retries
179
+ _LLM_RATE_LIMIT_RETRY_DELAYS = [30, 60] # exceed Bedrock's ~60s TPM bucket window
180
+
181
+
182
+ def _is_rate_limit_error(error: Exception) -> bool:
183
+ """Return True for rate-limit / quota-bucket style provider errors."""
184
+ err_str = str(error).lower()
185
+ rate_limit_patterns = [
186
+ "429",
187
+ "rate limit",
188
+ "rate_limit",
189
+ "too many requests",
190
+ "too many tokens",
191
+ "request limit",
192
+ "throttl",
193
+ ]
194
+ return any(pattern in err_str for pattern in rate_limit_patterns)
195
+
196
+
197
+ def _is_context_overflow_error(error: Exception) -> bool:
198
+ """Return True when the prompt exceeded the model's context window."""
199
+ if isinstance(error, ContextWindowExceededError):
200
+ return True
201
+
202
+ err_str = str(error).lower()
203
+ overflow_patterns = [
204
+ "context window exceeded",
205
+ "maximum context length",
206
+ "max context length",
207
+ "prompt is too long",
208
+ "context length exceeded",
209
+ "too many input tokens",
210
+ "input is too long",
211
+ ]
212
+ return any(pattern in err_str for pattern in overflow_patterns)
213
+
214
+
215
+ def _retry_delay_for(error: Exception, attempt_index: int) -> int | None:
216
+ """Return the delay for this retry attempt, or None if it should not retry."""
217
+ if _is_rate_limit_error(error):
218
+ schedule = _LLM_RATE_LIMIT_RETRY_DELAYS
219
+ elif _is_transient_error(error):
220
+ schedule = _LLM_RETRY_DELAYS
221
+ else:
222
+ return None
223
+
224
+ if attempt_index >= len(schedule):
225
+ return None
226
+ return schedule[attempt_index]
227
 
228
 
229
  def _is_transient_error(error: Exception) -> bool:
 
231
  err_str = str(error).lower()
232
  transient_patterns = [
233
  "timeout", "timed out",
 
234
  "503", "service unavailable",
235
  "502", "bad gateway",
236
  "500", "internal server error",
 
238
  "connection reset", "connection refused", "connection error",
239
  "eof", "broken pipe",
240
  ]
241
+ return _is_rate_limit_error(error) or any(pattern in err_str for pattern in transient_patterns)
242
 
243
 
244
  def _is_effort_config_error(error: Exception) -> bool:
 
419
  except ContextWindowExceededError:
420
  raise
421
  except Exception as e:
422
+ if _is_context_overflow_error(e):
423
+ raise ContextWindowExceededError(str(e)) from e
424
  if not _healed_effort and _is_effort_config_error(e):
425
  _healed_effort = True
426
  llm_params = await _heal_effort_and_rebuild_params(session, e, llm_params)
 
429
  data={"tool": "system", "log": "Reasoning effort not supported for this model — adjusting and retrying."},
430
  ))
431
  continue
432
+ _delay = _retry_delay_for(e, _llm_attempt)
433
+ if _llm_attempt < _MAX_LLM_RETRIES - 1 and _delay is not None:
434
  logger.warning(
435
  "Transient LLM error (attempt %d/%d): %s — retrying in %ds",
436
  _llm_attempt + 1, _MAX_LLM_RETRIES, e, _delay,
 
528
  except ContextWindowExceededError:
529
  raise
530
  except Exception as e:
531
+ if _is_context_overflow_error(e):
532
+ raise ContextWindowExceededError(str(e)) from e
533
  if not _healed_effort and _is_effort_config_error(e):
534
  _healed_effort = True
535
  llm_params = await _heal_effort_and_rebuild_params(session, e, llm_params)
 
538
  data={"tool": "system", "log": "Reasoning effort not supported for this model — adjusting and retrying."},
539
  ))
540
  continue
541
+ _delay = _retry_delay_for(e, _llm_attempt)
542
+ if _llm_attempt < _MAX_LLM_RETRIES - 1 and _delay is not None:
543
  logger.warning(
544
  "Transient LLM error (attempt %d/%d): %s — retrying in %ds",
545
  _llm_attempt + 1, _MAX_LLM_RETRIES, e, _delay,
 
691
  )
692
  )
693
 
694
+ malformed_tool = _detect_repeated_malformed(session.context_manager.items)
695
+ if malformed_tool:
696
+ recovery_prompt = (
697
+ "[SYSTEM: Repeated malformed tool arguments detected for "
698
+ f"'{malformed_tool}'. Stop retrying the same tool call shape. "
699
+ "Use a different strategy that produces smaller, valid JSON. "
700
+ "For large file writes, prefer bash with a heredoc or split the "
701
+ "edit into multiple smaller tool calls.]"
702
+ )
703
+ session.context_manager.add_message(
704
+ Message(role="user", content=recovery_prompt)
705
+ )
706
+ await session.send_event(
707
+ Event(
708
+ event_type="tool_log",
709
+ data={
710
+ "tool": "system",
711
+ "log": (
712
+ "Repeated malformed tool arguments detected — "
713
+ f"forcing a different strategy for {malformed_tool}"
714
+ ),
715
+ },
716
+ )
717
+ )
718
+
719
  messages = session.context_manager.get_messages()
720
  tools = session.tool_router.get_tool_specs_for_llm()
721
  try:
 
1137
  tool_args["script"] = edited_script
1138
  was_edited = True
1139
  logger.info(f"Using user-edited script for {tool_name} ({tc.id})")
1140
+ selected_namespace = approval_decision.get("namespace")
1141
+ if selected_namespace and tool_name == "hf_jobs":
1142
+ tool_args["namespace"] = selected_namespace
1143
  approved_tasks.append((tc, tool_name, tool_args, was_edited))
1144
  else:
1145
  rejected_tasks.append((tc, tool_name, approval_decision))
agent/core/doom_loop.py CHANGED
@@ -17,10 +17,11 @@ logger = logging.getLogger(__name__)
17
 
18
  @dataclass(frozen=True)
19
  class ToolCallSignature:
20
- """Hashable signature for a single tool call (name + args hash)."""
21
 
22
  name: str
23
  args_hash: str
 
24
 
25
 
26
  def _hash_args(args_str: str) -> str:
@@ -31,11 +32,16 @@ def _hash_args(args_str: str) -> str:
31
  def extract_recent_tool_signatures(
32
  messages: list[Message], lookback: int = 30
33
  ) -> list[ToolCallSignature]:
34
- """Extract tool call signatures from recent assistant messages."""
 
 
 
 
 
35
  signatures: list[ToolCallSignature] = []
36
  recent = messages[-lookback:] if len(messages) > lookback else messages
37
 
38
- for msg in recent:
39
  if getattr(msg, "role", None) != "assistant":
40
  continue
41
  tool_calls = getattr(msg, "tool_calls", None)
@@ -47,7 +53,21 @@ def extract_recent_tool_signatures(
47
  continue
48
  name = getattr(fn, "name", "") or ""
49
  args_str = getattr(fn, "arguments", "") or ""
50
- signatures.append(ToolCallSignature(name=name, args_hash=_hash_args(args_str)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  return signatures
53
 
 
17
 
18
  @dataclass(frozen=True)
19
  class ToolCallSignature:
20
+ """Hashable signature for a single tool call plus its observed result."""
21
 
22
  name: str
23
  args_hash: str
24
+ result_hash: str | None = None
25
 
26
 
27
  def _hash_args(args_str: str) -> str:
 
32
  def extract_recent_tool_signatures(
33
  messages: list[Message], lookback: int = 30
34
  ) -> list[ToolCallSignature]:
35
+ """Extract tool call signatures from recent assistant messages.
36
+
37
+ Includes the immediate tool result hash when present. This prevents
38
+ legitimate polling from being classified as a doom loop when the poll
39
+ arguments stay constant but the observed result keeps changing.
40
+ """
41
  signatures: list[ToolCallSignature] = []
42
  recent = messages[-lookback:] if len(messages) > lookback else messages
43
 
44
+ for idx, msg in enumerate(recent):
45
  if getattr(msg, "role", None) != "assistant":
46
  continue
47
  tool_calls = getattr(msg, "tool_calls", None)
 
53
  continue
54
  name = getattr(fn, "name", "") or ""
55
  args_str = getattr(fn, "arguments", "") or ""
56
+ result_hash = None
57
+ for follow in recent[idx + 1:]:
58
+ role = getattr(follow, "role", None)
59
+ if role == "tool" and getattr(follow, "tool_call_id", None) == getattr(tc, "id", None):
60
+ result_hash = _hash_args(str(getattr(follow, "content", "") or ""))
61
+ break
62
+ if role in {"assistant", "user"}:
63
+ break
64
+ signatures.append(
65
+ ToolCallSignature(
66
+ name=name,
67
+ args_hash=_hash_args(args_str),
68
+ result_hash=result_hash,
69
+ )
70
+ )
71
 
72
  return signatures
73
 
agent/core/effort_probe.py CHANGED
@@ -32,9 +32,10 @@ logger = logging.getLogger(__name__)
32
 
33
 
34
  # Cascade: for each user-stated preference, the ordered list of levels to
35
- # try. First success wins. ``max`` / ``xhigh`` are Anthropic-only; providers
36
- # that don't accept them raise ``UnsupportedEffortError`` synchronously (no
37
- # wasted network round-trip) and we advance to the next level.
 
38
  _EFFORT_CASCADE: dict[str, list[str]] = {
39
  "max": ["max", "xhigh", "high", "medium", "low"],
40
  "xhigh": ["xhigh", "high", "medium", "low"],
@@ -45,7 +46,10 @@ _EFFORT_CASCADE: dict[str, list[str]] = {
45
  }
46
 
47
  _PROBE_TIMEOUT = 15.0
48
- _PROBE_MAX_TOKENS = 16
 
 
 
49
 
50
 
51
  class ProbeInconclusive(Exception):
 
32
 
33
 
34
  # Cascade: for each user-stated preference, the ordered list of levels to
35
+ # try. First success wins. ``max`` is Anthropic-only; ``xhigh`` is also
36
+ # supported on current OpenAI GPT-5 models. Providers that don't accept a
37
+ # requested level raise ``UnsupportedEffortError`` synchronously (no wasted
38
+ # network round-trip) and we advance to the next level.
39
  _EFFORT_CASCADE: dict[str, list[str]] = {
40
  "max": ["max", "xhigh", "high", "medium", "low"],
41
  "xhigh": ["xhigh", "high", "medium", "low"],
 
46
  }
47
 
48
  _PROBE_TIMEOUT = 15.0
49
+ # Keep the probe cheap, but high enough that frontier reasoning models can
50
+ # finish a trivial reply instead of tripping a false "output limit reached"
51
+ # error during capability detection.
52
+ _PROBE_MAX_TOKENS = 64
53
 
54
 
55
  class ProbeInconclusive(Exception):
agent/core/hf_access.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Helpers for Hugging Face account / org access decisions."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import os
7
+ from dataclasses import dataclass
8
+ from typing import Any
9
+
10
+ import httpx
11
+
12
+ OPENID_PROVIDER_URL = os.environ.get("OPENID_PROVIDER_URL", "https://huggingface.co")
13
+
14
+
15
+ @dataclass(frozen=True)
16
+ class JobsAccess:
17
+ """Jobs entitlement derived from whoami-v2."""
18
+
19
+ username: str | None
20
+ plan: str
21
+ personal_can_run_jobs: bool
22
+ paid_org_names: list[str]
23
+ eligible_namespaces: list[str]
24
+ default_namespace: str | None
25
+ access_known: bool = True
26
+
27
+ @property
28
+ def can_run_jobs(self) -> bool:
29
+ return bool(self.default_namespace)
30
+
31
+
32
+ class JobsAccessError(Exception):
33
+ """Structured jobs access error for upgrade / namespace gating."""
34
+
35
+ def __init__(
36
+ self,
37
+ message: str,
38
+ *,
39
+ access: JobsAccess | None = None,
40
+ upgrade_required: bool = False,
41
+ namespace_required: bool = False,
42
+ ) -> None:
43
+ super().__init__(message)
44
+ self.access = access
45
+ self.upgrade_required = upgrade_required
46
+ self.namespace_required = namespace_required
47
+
48
+
49
+ def _extract_username(whoami: dict[str, Any]) -> str | None:
50
+ for key in ("name", "user", "preferred_username"):
51
+ value = whoami.get(key)
52
+ if isinstance(value, str) and value:
53
+ return value
54
+ return None
55
+
56
+
57
+ def _normalize_personal_plan(whoami: dict[str, Any]) -> str:
58
+ plan_str = ""
59
+ for key in ("plan", "type", "accountType"):
60
+ value = whoami.get(key)
61
+ if isinstance(value, str) and value:
62
+ plan_str = value.lower()
63
+ break
64
+
65
+ if not plan_str and (whoami.get("isPro") is True or whoami.get("is_pro") is True):
66
+ return "pro"
67
+
68
+ if any(tag in plan_str for tag in ("pro", "enterprise", "team")):
69
+ return "pro"
70
+ return "free"
71
+
72
+
73
+ def _paid_org_names(whoami: dict[str, Any]) -> list[str]:
74
+ names: list[str] = []
75
+ orgs = whoami.get("orgs") or []
76
+ if not isinstance(orgs, list):
77
+ return names
78
+
79
+ for org in orgs:
80
+ if not isinstance(org, dict):
81
+ continue
82
+ name = org.get("name")
83
+ if not isinstance(name, str) or not name:
84
+ continue
85
+ org_plan = str(org.get("plan") or org.get("type") or "").lower()
86
+ if any(tag in org_plan for tag in ("pro", "enterprise", "team")):
87
+ names.append(name)
88
+ return sorted(set(names))
89
+
90
+
91
+ def jobs_access_from_whoami(whoami: dict[str, Any]) -> JobsAccess:
92
+ username = _extract_username(whoami)
93
+ personal_plan = _normalize_personal_plan(whoami)
94
+ paid_orgs = _paid_org_names(whoami)
95
+ personal_can_run = personal_plan == "pro"
96
+
97
+ eligible_namespaces: list[str] = []
98
+ if personal_can_run and username:
99
+ eligible_namespaces.append(username)
100
+ eligible_namespaces.extend(paid_orgs)
101
+
102
+ plan = "pro" if personal_can_run else ("org" if paid_orgs else "free")
103
+ default_namespace = username if personal_can_run and username else None
104
+
105
+ return JobsAccess(
106
+ username=username,
107
+ plan=plan,
108
+ personal_can_run_jobs=personal_can_run,
109
+ paid_org_names=paid_orgs,
110
+ eligible_namespaces=eligible_namespaces,
111
+ default_namespace=default_namespace,
112
+ )
113
+
114
+
115
+ async def fetch_whoami_v2(token: str, timeout: float = 5.0) -> dict[str, Any] | None:
116
+ if not token:
117
+ return None
118
+ async with httpx.AsyncClient(timeout=timeout) as client:
119
+ try:
120
+ response = await client.get(
121
+ f"{OPENID_PROVIDER_URL}/api/whoami-v2",
122
+ headers={"Authorization": f"Bearer {token}"},
123
+ )
124
+ if response.status_code != 200:
125
+ return None
126
+ payload = response.json()
127
+ return payload if isinstance(payload, dict) else None
128
+ except (httpx.HTTPError, ValueError):
129
+ return None
130
+
131
+
132
+ async def get_jobs_access(token: str) -> JobsAccess | None:
133
+ whoami = await fetch_whoami_v2(token)
134
+ if whoami is None:
135
+ return None
136
+ return jobs_access_from_whoami(whoami)
137
+
138
+
139
+ async def resolve_jobs_namespace(
140
+ token: str,
141
+ requested_namespace: str | None = None,
142
+ ) -> tuple[str, JobsAccess | None]:
143
+ """Return the namespace to use for jobs.
144
+
145
+ If whoami-v2 is unavailable, fall back to the token owner's username.
146
+ """
147
+ access = await get_jobs_access(token)
148
+ if access:
149
+ if requested_namespace:
150
+ if requested_namespace in access.eligible_namespaces:
151
+ return requested_namespace, access
152
+ raise JobsAccessError(
153
+ f"You can only run jobs under your own Pro account or a paid org you belong to. "
154
+ f"Allowed namespaces: {', '.join(access.eligible_namespaces) or '(none)'}",
155
+ access=access,
156
+ )
157
+ if access.default_namespace:
158
+ return access.default_namespace, access
159
+ if access.paid_org_names:
160
+ raise JobsAccessError(
161
+ "Choose which paid organization should own this job run.",
162
+ access=access,
163
+ namespace_required=True,
164
+ )
165
+ raise JobsAccessError(
166
+ "Hugging Face Jobs are available only to Pro users and Team or Enterprise organizations. "
167
+ "Upgrade to Pro, or run the job under a paid org you belong to.",
168
+ access=access,
169
+ upgrade_required=True,
170
+ )
171
+
172
+ # Fallback: whoami-v2 unavailable. Do not block the call pre-emptively.
173
+ from huggingface_hub import HfApi
174
+
175
+ username = None
176
+ if token:
177
+ whoami = await asyncio.to_thread(HfApi(token=token).whoami)
178
+ username = whoami.get("name")
179
+ if not username:
180
+ raise JobsAccessError("No HF token available to resolve a jobs namespace.")
181
+ return requested_namespace or username, None
agent/core/llm_params.py CHANGED
@@ -66,13 +66,13 @@ _patch_litellm_effort_validation()
66
 
67
  # Effort levels accepted on the wire.
68
  # Anthropic (4.6+): low | medium | high | xhigh | max (output_config.effort)
69
- # OpenAI direct: minimal | low | medium | high (reasoning_effort top-level)
70
  # HF router: low | medium | high (extra_body.reasoning_effort)
71
  #
72
  # We validate *shape* here and let the probe cascade walk down on rejection;
73
  # we deliberately do NOT maintain a per-model capability table.
74
  _ANTHROPIC_EFFORTS = {"low", "medium", "high", "xhigh", "max"}
75
- _OPENAI_EFFORTS = {"minimal", "low", "medium", "high"}
76
  _HF_EFFORTS = {"low", "medium", "high"}
77
 
78
 
 
66
 
67
  # Effort levels accepted on the wire.
68
  # Anthropic (4.6+): low | medium | high | xhigh | max (output_config.effort)
69
+ # OpenAI direct: minimal | low | medium | high | xhigh (reasoning_effort top-level)
70
  # HF router: low | medium | high (extra_body.reasoning_effort)
71
  #
72
  # We validate *shape* here and let the probe cascade walk down on rejection;
73
  # we deliberately do NOT maintain a per-model capability table.
74
  _ANTHROPIC_EFFORTS = {"low", "medium", "high", "xhigh", "max"}
75
+ _OPENAI_EFFORTS = {"minimal", "low", "medium", "high", "xhigh"}
76
  _HF_EFFORTS = {"low", "medium", "high"}
77
 
78
 
agent/core/model_switcher.py CHANGED
@@ -24,8 +24,11 @@ from agent.core.effort_probe import ProbeInconclusive, probe_effort
24
  # ":cheapest" / ":preferred" / ":<provider>" to override the default
25
  # routing policy (auto = fastest with failover).
26
  SUGGESTED_MODELS = [
27
- {"id": "bedrock/us.anthropic.claude-opus-4-7", "label": "Claude Opus 4.7"},
28
- {"id": "bedrock/us.anthropic.claude-opus-4-6-v1", "label": "Claude Opus 4.6"},
 
 
 
29
  {"id": "MiniMaxAI/MiniMax-M2.7", "label": "MiniMax M2.7"},
30
  {"id": "moonshotai/Kimi-K2.6", "label": "Kimi K2.6"},
31
  {"id": "zai-org/GLM-5.1", "label": "GLM 5.1"},
 
24
  # ":cheapest" / ":preferred" / ":<provider>" to override the default
25
  # routing policy (auto = fastest with failover).
26
  SUGGESTED_MODELS = [
27
+ {"id": "openai/gpt-5.5", "label": "GPT-5.5"},
28
+ {"id": "openai/gpt-5.4", "label": "GPT-5.4"},
29
+ {"id": "anthropic/claude-opus-4-7", "label": "Claude Opus 4.7"},
30
+ {"id": "anthropic/claude-opus-4-6", "label": "Claude Opus 4.6"},
31
+ {"id": "bedrock/us.anthropic.claude-opus-4-6-v1", "label": "Claude Opus 4.6 via Bedrock"},
32
  {"id": "MiniMaxAI/MiniMax-M2.7", "label": "MiniMax M2.7"},
33
  {"id": "moonshotai/Kimi-K2.6", "label": "Kimi K2.6"},
34
  {"id": "zai-org/GLM-5.1", "label": "GLM 5.1"},
agent/core/telemetry.py CHANGED
@@ -141,6 +141,7 @@ async def record_hf_job_submit(
141
  "timeout": args.get("timeout", "30m"),
142
  "job_type": job_type,
143
  "image": image,
 
144
  "push_to_hub": _infer_push_to_hub(script_text),
145
  },
146
  ))
@@ -239,6 +240,43 @@ async def record_feedback(
239
  logger.debug("record_feedback failed (non-fatal): %s", e)
240
 
241
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  # ── heartbeat ──────────────────────────────────────────────────────────────
243
 
244
  # Module-level reference set for fire-and-forget heartbeat tasks. asyncio only
 
141
  "timeout": args.get("timeout", "30m"),
142
  "job_type": job_type,
143
  "image": image,
144
+ "namespace": args.get("namespace"),
145
  "push_to_hub": _infer_push_to_hub(script_text),
146
  },
147
  ))
 
240
  logger.debug("record_feedback failed (non-fatal): %s", e)
241
 
242
 
243
+ async def record_jobs_access_blocked(
244
+ session: Any,
245
+ *,
246
+ tool_call_ids: list[str],
247
+ plan: str,
248
+ eligible_namespaces: list[str],
249
+ ) -> None:
250
+ from agent.core.session import Event
251
+ try:
252
+ await session.send_event(Event(
253
+ event_type="jobs_access_blocked",
254
+ data={
255
+ "tool_call_ids": tool_call_ids,
256
+ "plan": plan,
257
+ "eligible_namespaces": eligible_namespaces,
258
+ },
259
+ ))
260
+ except Exception as e:
261
+ logger.debug("record_jobs_access_blocked failed (non-fatal): %s", e)
262
+
263
+
264
+ async def record_pro_cta_click(
265
+ session: Any,
266
+ *,
267
+ source: str,
268
+ target: str = "pro_pricing",
269
+ ) -> None:
270
+ from agent.core.session import Event
271
+ try:
272
+ await session.send_event(Event(
273
+ event_type="pro_cta_click",
274
+ data={"source": source, "target": target},
275
+ ))
276
+ except Exception as e:
277
+ logger.debug("record_pro_cta_click failed (non-fatal): %s", e)
278
+
279
+
280
  # ── heartbeat ──────────────────────────────────────────────────────────────
281
 
282
  # Module-level reference set for fire-and-forget heartbeat tasks. asyncio only
agent/main.py CHANGED
@@ -50,6 +50,16 @@ litellm.drop_params = True
50
  # on every error — users don't need it, and our friendly errors cover the case.
51
  litellm.suppress_debug_info = True
52
 
 
 
 
 
 
 
 
 
 
 
53
  def _safe_get_args(arguments: dict) -> dict:
54
  """Safely extract args dict from arguments, handling cases where LLM passes string."""
55
  args = arguments.get("args", {})
@@ -771,8 +781,9 @@ async def _handle_slash_command(
771
  console.print(f" [dim]{m}: {eff or 'off'}[/dim]")
772
  console.print(
773
  "[dim]Set with '/effort minimal|low|medium|high|xhigh|max|off'. "
774
- "'max' and 'xhigh' are Anthropic-only; the cascade falls back "
775
- "to whatever the model actually accepts.[/dim]"
 
776
  )
777
  return None
778
  level = arg.lower()
@@ -820,6 +831,8 @@ async def main():
820
  if not hf_token:
821
  hf_token = await _prompt_and_save_hf_token(prompt_session)
822
 
 
 
823
  # Resolve username for banner
824
  hf_user = None
825
  try:
@@ -828,7 +841,7 @@ async def main():
828
  except Exception:
829
  pass
830
 
831
- print_banner(hf_user=hf_user)
832
 
833
  # Pre-warm the HF router catalog in the background so /model switches
834
  # don't block on a network fetch.
@@ -844,10 +857,6 @@ async def main():
844
  turn_complete_event.set()
845
  ready_event = asyncio.Event()
846
 
847
- # Start agent loop in background
848
- config_path = Path(__file__).parent.parent / "configs" / "main_agent_config.json"
849
- config = load_config(config_path)
850
-
851
  # Create tool router with local mode
852
  tool_router = ToolRouter(config.mcpServers, hf_token=hf_token, local_mode=True)
853
 
@@ -1036,6 +1045,7 @@ async def headless_main(
1036
  import logging
1037
 
1038
  logging.basicConfig(level=logging.WARNING)
 
1039
 
1040
  hf_token = _get_hf_token()
1041
  if not hf_token:
@@ -1044,8 +1054,7 @@ async def headless_main(
1044
 
1045
  print(f"HF token loaded", file=sys.stderr)
1046
 
1047
- config_path = Path(__file__).parent.parent / "configs" / "main_agent_config.json"
1048
- config = load_config(config_path)
1049
  config.yolo_mode = True # Auto-approve everything in headless mode
1050
 
1051
  if model:
@@ -1221,6 +1230,7 @@ def cli():
1221
  import warnings
1222
  # Suppress aiohttp "Unclosed client session" noise during event loop teardown
1223
  _logging.getLogger("asyncio").setLevel(_logging.CRITICAL)
 
1224
  # Suppress litellm pydantic deprecation warnings
1225
  warnings.filterwarnings("ignore", category=DeprecationWarning, module="litellm")
1226
  # Suppress whoosh invalid escape sequence warnings (third-party, unfixed upstream)
 
50
  # on every error — users don't need it, and our friendly errors cover the case.
51
  litellm.suppress_debug_info = True
52
 
53
+ CLI_CONFIG_PATH = Path(__file__).parent.parent / "configs" / "cli_agent_config.json"
54
+
55
+
56
+ def _configure_runtime_logging() -> None:
57
+ """Keep third-party warning spam from punching through the interactive UI."""
58
+ import logging
59
+
60
+ logging.getLogger("LiteLLM").setLevel(logging.ERROR)
61
+ logging.getLogger("litellm").setLevel(logging.ERROR)
62
+
63
  def _safe_get_args(arguments: dict) -> dict:
64
  """Safely extract args dict from arguments, handling cases where LLM passes string."""
65
  args = arguments.get("args", {})
 
781
  console.print(f" [dim]{m}: {eff or 'off'}[/dim]")
782
  console.print(
783
  "[dim]Set with '/effort minimal|low|medium|high|xhigh|max|off'. "
784
+ "'max' is Anthropic-only; 'xhigh' is also supported by current "
785
+ "OpenAI GPT-5 models. The cascade falls back to whatever the "
786
+ "model actually accepts.[/dim]"
787
  )
788
  return None
789
  level = arg.lower()
 
831
  if not hf_token:
832
  hf_token = await _prompt_and_save_hf_token(prompt_session)
833
 
834
+ config = load_config(CLI_CONFIG_PATH)
835
+
836
  # Resolve username for banner
837
  hf_user = None
838
  try:
 
841
  except Exception:
842
  pass
843
 
844
+ print_banner(model=config.model_name, hf_user=hf_user)
845
 
846
  # Pre-warm the HF router catalog in the background so /model switches
847
  # don't block on a network fetch.
 
857
  turn_complete_event.set()
858
  ready_event = asyncio.Event()
859
 
 
 
 
 
860
  # Create tool router with local mode
861
  tool_router = ToolRouter(config.mcpServers, hf_token=hf_token, local_mode=True)
862
 
 
1045
  import logging
1046
 
1047
  logging.basicConfig(level=logging.WARNING)
1048
+ _configure_runtime_logging()
1049
 
1050
  hf_token = _get_hf_token()
1051
  if not hf_token:
 
1054
 
1055
  print(f"HF token loaded", file=sys.stderr)
1056
 
1057
+ config = load_config(CLI_CONFIG_PATH)
 
1058
  config.yolo_mode = True # Auto-approve everything in headless mode
1059
 
1060
  if model:
 
1230
  import warnings
1231
  # Suppress aiohttp "Unclosed client session" noise during event loop teardown
1232
  _logging.getLogger("asyncio").setLevel(_logging.CRITICAL)
1233
+ _configure_runtime_logging()
1234
  # Suppress litellm pydantic deprecation warnings
1235
  warnings.filterwarnings("ignore", category=DeprecationWarning, module="litellm")
1236
  # Suppress whoosh invalid escape sequence warnings (third-party, unfixed upstream)
agent/tools/jobs_tool.py CHANGED
@@ -17,6 +17,7 @@ import httpx
17
  from huggingface_hub import HfApi
18
  from huggingface_hub.utils import HfHubHTTPError
19
 
 
20
  from agent.core.session import Event
21
  from agent.tools.types import ToolResult
22
 
@@ -298,6 +299,7 @@ class HfJobsTool:
298
  self,
299
  hf_token: Optional[str] = None,
300
  namespace: Optional[str] = None,
 
301
  log_callback: Optional[Callable[[str], Awaitable[None]]] = None,
302
  session: Any = None,
303
  tool_call_id: Optional[str] = None,
@@ -305,6 +307,7 @@ class HfJobsTool:
305
  self.hf_token = hf_token
306
  self.api = HfApi(token=hf_token)
307
  self.namespace = namespace
 
308
  self.log_callback = log_callback
309
  self.session = session
310
  self.tool_call_id = tool_call_id
@@ -565,7 +568,7 @@ class HfJobsTool:
565
  from agent.core import telemetry
566
  submit_ts = await telemetry.record_hf_job_submit(
567
  self.session, job,
568
- {**args, "hardware_flavor": flavor, "timeout": timeout_str},
569
  image=image, job_type=job_type,
570
  )
571
 
@@ -1057,6 +1060,14 @@ HF_JOBS_TOOL_SPEC = {
1057
  "type": "object",
1058
  "description": "Environment variables {'KEY': 'VALUE'}. HF_TOKEN is auto-included.",
1059
  },
 
 
 
 
 
 
 
 
1060
  "job_id": {
1061
  "type": "string",
1062
  "description": "Job ID. Required for: logs, inspect, cancel.",
@@ -1099,11 +1110,18 @@ async def hf_jobs_handler(
1099
  arguments = {**arguments, "script": content}
1100
 
1101
  hf_token = session.hf_token if session else None
1102
- namespace = os.environ.get("HF_NAMESPACE") or (HfApi(token=hf_token).whoami().get("name") if hf_token else None)
 
 
 
 
 
 
1103
 
1104
  tool = HfJobsTool(
1105
  namespace=namespace,
1106
  hf_token=hf_token,
 
1107
  log_callback=log_callback if session else None,
1108
  session=session,
1109
  tool_call_id=tool_call_id,
 
17
  from huggingface_hub import HfApi
18
  from huggingface_hub.utils import HfHubHTTPError
19
 
20
+ from agent.core.hf_access import JobsAccessError, resolve_jobs_namespace
21
  from agent.core.session import Event
22
  from agent.tools.types import ToolResult
23
 
 
299
  self,
300
  hf_token: Optional[str] = None,
301
  namespace: Optional[str] = None,
302
+ jobs_access: Any = None,
303
  log_callback: Optional[Callable[[str], Awaitable[None]]] = None,
304
  session: Any = None,
305
  tool_call_id: Optional[str] = None,
 
307
  self.hf_token = hf_token
308
  self.api = HfApi(token=hf_token)
309
  self.namespace = namespace
310
+ self.jobs_access = jobs_access
311
  self.log_callback = log_callback
312
  self.session = session
313
  self.tool_call_id = tool_call_id
 
568
  from agent.core import telemetry
569
  submit_ts = await telemetry.record_hf_job_submit(
570
  self.session, job,
571
+ {**args, "hardware_flavor": flavor, "timeout": timeout_str, "namespace": self.namespace},
572
  image=image, job_type=job_type,
573
  )
574
 
 
1060
  "type": "object",
1061
  "description": "Environment variables {'KEY': 'VALUE'}. HF_TOKEN is auto-included.",
1062
  },
1063
+ "namespace": {
1064
+ "type": "string",
1065
+ "description": (
1066
+ "Optional namespace to run the job under. Must be your own Pro account "
1067
+ "or a paid org you belong to. If omitted, the tool prefers your personal "
1068
+ "account when eligible, otherwise the first eligible paid org."
1069
+ ),
1070
+ },
1071
  "job_id": {
1072
  "type": "string",
1073
  "description": "Job ID. Required for: logs, inspect, cancel.",
 
1110
  arguments = {**arguments, "script": content}
1111
 
1112
  hf_token = session.hf_token if session else None
1113
+ try:
1114
+ namespace, jobs_access = await resolve_jobs_namespace(
1115
+ hf_token or "",
1116
+ arguments.get("namespace"),
1117
+ )
1118
+ except JobsAccessError as e:
1119
+ return str(e), False
1120
 
1121
  tool = HfJobsTool(
1122
  namespace=namespace,
1123
  hf_token=hf_token,
1124
+ jobs_access=jobs_access,
1125
  log_callback=log_callback if session else None,
1126
  session=session,
1127
  tool_call_id=tool_call_id,
agent/tools/research_tool.py CHANGED
@@ -216,7 +216,9 @@ RESEARCH_TOOL_SPEC = {
216
 
217
  def _get_research_model(main_model: str) -> str:
218
  """Pick a cheaper model for research based on the main model."""
219
- if "anthropic" in main_model:
 
 
220
  return "bedrock/us.anthropic.claude-sonnet-4-6"
221
  # For non-Anthropic models (HF router etc.), use the same model
222
  return main_model
 
216
 
217
  def _get_research_model(main_model: str) -> str:
218
  """Pick a cheaper model for research based on the main model."""
219
+ if main_model.startswith("anthropic/"):
220
+ return "anthropic/claude-sonnet-4-6"
221
+ if main_model.startswith("bedrock/") and "anthropic" in main_model:
222
  return "bedrock/us.anthropic.claude-sonnet-4-6"
223
  # For non-Anthropic models (HF router etc.), use the same model
224
  return main_model
agent/tools/sandbox_tool.py CHANGED
@@ -213,16 +213,26 @@ async def sandbox_create_handler(
213
  args: dict[str, Any], session: Any = None
214
  ) -> tuple[str, bool]:
215
  """Handle sandbox_create tool calls."""
 
 
216
  # If sandbox already exists, return its info
217
  if session and getattr(session, "sandbox", None):
218
  sb = session.sandbox
 
 
 
 
 
 
 
 
219
  return (
220
  f"Sandbox already active: {sb.space_id}\n"
221
  f"URL: {sb.url}\n"
 
222
  f"Use bash/read/write/edit to interact with it."
223
  ), True
224
 
225
- hardware = args.get("hardware", "cpu-basic")
226
  create_kwargs = {}
227
  if "private" in args:
228
  create_kwargs["private"] = args["private"]
 
213
  args: dict[str, Any], session: Any = None
214
  ) -> tuple[str, bool]:
215
  """Handle sandbox_create tool calls."""
216
+ hardware = args.get("hardware", "cpu-basic")
217
+
218
  # If sandbox already exists, return its info
219
  if session and getattr(session, "sandbox", None):
220
  sb = session.sandbox
221
+ requested_hardware = args.get("hardware")
222
+ lockout_note = ""
223
+ if requested_hardware:
224
+ lockout_note = (
225
+ f"\nRequested hardware: {requested_hardware}\n"
226
+ "Hardware cannot be changed by calling sandbox_create again. "
227
+ "Delete the existing sandbox first if you need a different tier."
228
+ )
229
  return (
230
  f"Sandbox already active: {sb.space_id}\n"
231
  f"URL: {sb.url}\n"
232
+ f"{lockout_note}\n"
233
  f"Use bash/read/write/edit to interact with it."
234
  ), True
235
 
 
236
  create_kwargs = {}
237
  if "private" in args:
238
  create_kwargs["private"] = args["private"]
agent/utils/terminal_display.py CHANGED
@@ -99,7 +99,7 @@ def print_banner(model: str | None = None, hf_user: str | None = None) -> None:
99
  _console.file.write("\033[2J\033[H")
100
  _console.file.flush()
101
 
102
- model_label = model or "bedrock/us.anthropic.claude-opus-4-6-v1"
103
  user_label = hf_user or "not logged in"
104
 
105
  # Warm gold palette matching the shimmer highlight (255, 200, 80)
@@ -180,10 +180,8 @@ class SubAgentDisplayManager:
180
  def __init__(self):
181
  self._agents: dict[str, dict] = {} # agent_id -> state dict
182
  self._lines_on_screen = 0
183
- self._ticker_task = None
184
 
185
  def start(self, agent_id: str, label: str = "research") -> None:
186
- import asyncio
187
  import time
188
  self._agents[agent_id] = {
189
  "label": label,
@@ -192,8 +190,6 @@ class SubAgentDisplayManager:
192
  "token_count": 0,
193
  "start_time": time.monotonic(),
194
  }
195
- if not self._ticker_task:
196
- self._ticker_task = asyncio.ensure_future(self._tick())
197
  self._redraw()
198
 
199
  def set_tokens(self, agent_id: str, tokens: int) -> None:
@@ -222,11 +218,7 @@ class SubAgentDisplayManager:
222
  _console.file.write(line + "\n")
223
  _console.file.flush()
224
  self._lines_on_screen = 0
225
- if not self._agents:
226
- if self._ticker_task:
227
- self._ticker_task.cancel()
228
- self._ticker_task = None
229
- else:
230
  self._redraw()
231
 
232
  @staticmethod
@@ -239,16 +231,6 @@ class SubAgentDisplayManager:
239
  line += f" \033[2m({stats})\033[0m"
240
  return line
241
 
242
- async def _tick(self) -> None:
243
- import asyncio
244
- try:
245
- while True:
246
- await asyncio.sleep(1.0)
247
- if self._agents:
248
- self._redraw()
249
- except asyncio.CancelledError:
250
- pass
251
-
252
  @staticmethod
253
  def _format_stats(agent: dict) -> str:
254
  import time
 
99
  _console.file.write("\033[2J\033[H")
100
  _console.file.flush()
101
 
102
+ model_label = model or "unknown"
103
  user_label = hf_user or "not logged in"
104
 
105
  # Warm gold palette matching the shimmer highlight (255, 200, 80)
 
180
  def __init__(self):
181
  self._agents: dict[str, dict] = {} # agent_id -> state dict
182
  self._lines_on_screen = 0
 
183
 
184
  def start(self, agent_id: str, label: str = "research") -> None:
 
185
  import time
186
  self._agents[agent_id] = {
187
  "label": label,
 
190
  "token_count": 0,
191
  "start_time": time.monotonic(),
192
  }
 
 
193
  self._redraw()
194
 
195
  def set_tokens(self, agent_id: str, tokens: int) -> None:
 
218
  _console.file.write(line + "\n")
219
  _console.file.flush()
220
  self._lines_on_screen = 0
221
+ if self._agents:
 
 
 
 
222
  self._redraw()
223
 
224
  @staticmethod
 
231
  line += f" \033[2m({stats})\033[0m"
232
  return line
233
 
 
 
 
 
 
 
 
 
 
 
234
  @staticmethod
235
  def _format_stats(agent: dict) -> str:
236
  import time
backend/dependencies.py CHANGED
@@ -12,6 +12,8 @@ from typing import Any
12
  import httpx
13
  from fastapi import HTTPException, Request, status
14
 
 
 
15
  logger = logging.getLogger(__name__)
16
 
17
  OPENID_PROVIDER_URL = os.environ.get("OPENID_PROVIDER_URL", "https://huggingface.co")
@@ -80,41 +82,6 @@ def _user_from_info(user_info: dict[str, Any]) -> dict[str, Any]:
80
  }
81
 
82
 
83
- def _normalize_plan(whoami: dict[str, Any]) -> str:
84
- """Map an HF /api/whoami-v2 payload to one of: 'free' | 'pro' | 'org'.
85
-
86
- The exact field shape in whoami-v2 isn't documented for our purposes,
87
- so we try a handful of likely keys and fall back to 'free'. The first
88
- call logs the raw shape at DEBUG (see `_fetch_user_plan`) so we can
89
- pin the real key post-deploy.
90
- """
91
- plan_str = ""
92
- for key in ("plan", "type", "accountType"):
93
- val = whoami.get(key)
94
- if isinstance(val, str) and val:
95
- plan_str = val.lower()
96
- break
97
-
98
- if not plan_str:
99
- if whoami.get("isPro") is True or whoami.get("is_pro") is True:
100
- return "pro"
101
-
102
- if "pro" in plan_str or "enterprise" in plan_str or "team" in plan_str:
103
- return "pro"
104
-
105
- # Org tier: anyone in a paid / enterprise org. We don't pay for this
106
- # right now, but the "pro" cap applies identically.
107
- orgs = whoami.get("orgs") or []
108
- if isinstance(orgs, list):
109
- for org in orgs:
110
- if isinstance(org, dict):
111
- org_plan = str(org.get("plan") or org.get("type") or "").lower()
112
- if "pro" in org_plan or "enterprise" in org_plan or "team" in org_plan:
113
- return "org"
114
-
115
- return "free"
116
-
117
-
118
  async def _fetch_user_plan(token: str) -> str:
119
  """Look up the user's HF plan via /api/whoami-v2.
120
 
@@ -123,19 +90,9 @@ async def _fetch_user_plan(token: str) -> str:
123
  grant the Pro cap than over-grant it on bad data.
124
  """
125
  global _WHOAMI_SHAPE_LOGGED
126
- async with httpx.AsyncClient(timeout=5.0) as client:
127
- try:
128
- resp = await client.get(
129
- f"{OPENID_PROVIDER_URL}/api/whoami-v2",
130
- headers={"Authorization": f"Bearer {token}"},
131
- )
132
- if resp.status_code != 200:
133
- return "free"
134
- whoami = resp.json()
135
- except httpx.HTTPError:
136
- return "free"
137
- except ValueError:
138
- return "free"
139
 
140
  if not _WHOAMI_SHAPE_LOGGED:
141
  _WHOAMI_SHAPE_LOGGED = True
@@ -149,7 +106,7 @@ async def _fetch_user_plan(token: str) -> str:
149
 
150
  if not isinstance(whoami, dict):
151
  return "free"
152
- return _normalize_plan(whoami)
153
 
154
 
155
  async def _extract_user_from_token(token: str) -> dict[str, Any] | None:
@@ -246,4 +203,3 @@ async def require_huggingface_org_member(request: Request) -> bool:
246
  return False
247
  return await check_org_membership(token, HF_EMPLOYEE_ORG)
248
 
249
-
 
12
  import httpx
13
  from fastapi import HTTPException, Request, status
14
 
15
+ from agent.core.hf_access import fetch_whoami_v2, jobs_access_from_whoami
16
+
17
  logger = logging.getLogger(__name__)
18
 
19
  OPENID_PROVIDER_URL = os.environ.get("OPENID_PROVIDER_URL", "https://huggingface.co")
 
82
  }
83
 
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  async def _fetch_user_plan(token: str) -> str:
86
  """Look up the user's HF plan via /api/whoami-v2.
87
 
 
90
  grant the Pro cap than over-grant it on bad data.
91
  """
92
  global _WHOAMI_SHAPE_LOGGED
93
+ whoami = await fetch_whoami_v2(token)
94
+ if whoami is None:
95
+ return "free"
 
 
 
 
 
 
 
 
 
 
96
 
97
  if not _WHOAMI_SHAPE_LOGGED:
98
  _WHOAMI_SHAPE_LOGGED = True
 
106
 
107
  if not isinstance(whoami, dict):
108
  return "free"
109
+ return jobs_access_from_whoami(whoami).plan
110
 
111
 
112
  async def _extract_user_from_token(token: str) -> dict[str, Any] | None:
 
203
  return False
204
  return await check_org_membership(token, HF_EMPLOYEE_ORG)
205
 
 
backend/models.py CHANGED
@@ -38,6 +38,7 @@ class ToolApproval(BaseModel):
38
  approved: bool
39
  feedback: str | None = None
40
  edited_script: str | None = None
 
41
 
42
 
43
  class ApprovalRequest(BaseModel):
 
38
  approved: bool
39
  feedback: str | None = None
40
  edited_script: str | None = None
41
+ namespace: str | None = None
42
 
43
 
44
  class ApprovalRequest(BaseModel):
backend/routes/agent.py CHANGED
@@ -32,6 +32,7 @@ from session_manager import MAX_SESSIONS, AgentSession, SessionCapacityError, se
32
 
33
  import user_quotas
34
 
 
35
  from agent.core.llm_params import _resolve_llm_params
36
 
37
  logger = logging.getLogger(__name__)
@@ -136,6 +137,105 @@ async def _enforce_claude_quota(
136
  agent_session.claude_counted = True
137
 
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  def _check_session_access(session_id: str, user: dict[str, Any]) -> None:
140
  """Verify the user has access to the given session. Raises 403 or 404."""
141
  info = session_manager.get_session_info(session_id)
@@ -442,6 +542,27 @@ async def get_user_quota(user: dict = Depends(get_current_user)) -> dict:
442
  }
443
 
444
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
  @router.get("/sessions", response_model=list[SessionInfo])
446
  async def list_sessions(user: dict = Depends(get_current_user)) -> list[SessionInfo]:
447
  """List sessions belonging to the authenticated user."""
@@ -482,15 +603,20 @@ async def submit_approval(
482
  ) -> dict:
483
  """Submit tool approvals to a session. Only accessible by the session owner."""
484
  _check_session_access(request.session_id, user)
 
 
 
485
  approvals = [
486
  {
487
  "tool_call_id": a.tool_call_id,
488
  "approved": a.approved,
489
  "feedback": a.feedback,
490
  "edited_script": a.edited_script,
 
491
  }
492
  for a in request.approvals
493
  ]
 
494
  success = await session_manager.submit_approval(request.session_id, approvals)
495
  if not success:
496
  raise HTTPException(status_code=404, detail="Session not found or inactive")
@@ -540,9 +666,11 @@ async def chat_sse(
540
  "approved": a["approved"],
541
  "feedback": a.get("feedback"),
542
  "edited_script": a.get("edited_script"),
 
543
  }
544
  for a in approvals
545
  ]
 
546
  success = await session_manager.submit_approval(session_id, formatted)
547
  elif text is not None:
548
  success = await session_manager.submit_user_input(session_id, text)
@@ -554,6 +682,7 @@ async def chat_sse(
554
  broadcaster.unsubscribe(sub_id)
555
  raise HTTPException(status_code=404, detail="Session not found or inactive")
556
  except HTTPException:
 
557
  raise
558
  except Exception:
559
  broadcaster.unsubscribe(sub_id)
@@ -562,6 +691,31 @@ async def chat_sse(
562
  return _sse_response(broadcaster, event_queue, sub_id)
563
 
564
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
565
  # ---------------------------------------------------------------------------
566
  # Shared SSE helpers
567
  # ---------------------------------------------------------------------------
@@ -729,5 +883,3 @@ async def submit_feedback(
729
  agent_session.session.config.session_dataset_repo
730
  )
731
  return {"status": "ok"}
732
-
733
-
 
32
 
33
  import user_quotas
34
 
35
+ from agent.core.hf_access import get_jobs_access
36
  from agent.core.llm_params import _resolve_llm_params
37
 
38
  logger = logging.getLogger(__name__)
 
137
  agent_session.claude_counted = True
138
 
139
 
140
+ async def _enforce_jobs_access_for_approvals(
141
+ user: dict[str, Any],
142
+ agent_session: AgentSession,
143
+ approvals: list[dict[str, Any]],
144
+ ) -> None:
145
+ """Block approved hf_jobs tool calls when the user has no eligible jobs namespace."""
146
+ pending = agent_session.session.pending_approval or {}
147
+ tool_calls = pending.get("tool_calls") or []
148
+ if not tool_calls:
149
+ return
150
+
151
+ approved_ids = {
152
+ a.get("tool_call_id")
153
+ for a in approvals
154
+ if a.get("approved")
155
+ }
156
+ if not approved_ids:
157
+ return
158
+
159
+ hf_job_ids = [
160
+ tc.id for tc in tool_calls
161
+ if tc.id in approved_ids and tc.function.name == "hf_jobs"
162
+ ]
163
+ if not hf_job_ids:
164
+ return
165
+
166
+ token = agent_session.hf_token or agent_session.session.hf_token
167
+ if not token:
168
+ return
169
+
170
+ access = await get_jobs_access(token)
171
+ if access is None:
172
+ return
173
+
174
+ approval_map = {a.get("tool_call_id"): a for a in approvals}
175
+ if access.personal_can_run_jobs:
176
+ return
177
+
178
+ if access.paid_org_names:
179
+ invalid_namespace = [
180
+ tool_call_id
181
+ for tool_call_id in hf_job_ids
182
+ if (
183
+ approval_map.get(tool_call_id, {}).get("namespace")
184
+ and approval_map.get(tool_call_id, {}).get("namespace") not in access.paid_org_names
185
+ )
186
+ ]
187
+ if invalid_namespace:
188
+ raise HTTPException(
189
+ status_code=400,
190
+ detail={
191
+ "error": "hf_jobs_invalid_namespace",
192
+ "message": (
193
+ "The selected jobs namespace is not one of your eligible paid organizations. "
194
+ f"Allowed namespaces: {', '.join(access.paid_org_names)}"
195
+ ),
196
+ },
197
+ )
198
+ missing_namespace = [
199
+ tool_call_id
200
+ for tool_call_id in hf_job_ids
201
+ if not approval_map.get(tool_call_id, {}).get("namespace")
202
+ ]
203
+ if missing_namespace:
204
+ raise HTTPException(
205
+ status_code=409,
206
+ detail={
207
+ "error": "hf_jobs_namespace_required",
208
+ "message": "Choose which paid organization should own this job run.",
209
+ "plan": user.get("plan", "free"),
210
+ "tool_call_ids": missing_namespace,
211
+ "eligible_namespaces": access.paid_org_names,
212
+ },
213
+ )
214
+ return
215
+
216
+ from agent.core import telemetry
217
+ await telemetry.record_jobs_access_blocked(
218
+ agent_session.session,
219
+ tool_call_ids=hf_job_ids,
220
+ plan=user.get("plan", "free"),
221
+ eligible_namespaces=access.eligible_namespaces,
222
+ )
223
+
224
+ raise HTTPException(
225
+ status_code=402,
226
+ detail={
227
+ "error": "hf_jobs_upgrade_required",
228
+ "message": (
229
+ "Hugging Face Jobs are available only to Pro users and Team or Enterprise organizations. "
230
+ "Upgrade to Pro, or decline the job tool call so the agent can choose another path."
231
+ ),
232
+ "plan": user.get("plan", "free"),
233
+ "tool_call_ids": hf_job_ids,
234
+ "eligible_namespaces": access.eligible_namespaces,
235
+ },
236
+ )
237
+
238
+
239
  def _check_session_access(session_id: str, user: dict[str, Any]) -> None:
240
  """Verify the user has access to the given session. Raises 403 or 404."""
241
  info = session_manager.get_session_info(session_id)
 
542
  }
543
 
544
 
545
+ @router.get("/user/jobs-access")
546
+ async def get_jobs_access_info(request: Request, user: dict = Depends(get_current_user)) -> dict:
547
+ """Return whether the current token can run HF Jobs and under which namespaces."""
548
+ token = None
549
+ auth_header = request.headers.get("Authorization", "")
550
+ if auth_header.startswith("Bearer "):
551
+ token = auth_header[7:]
552
+ if not token:
553
+ token = request.cookies.get("hf_access_token")
554
+ if not token:
555
+ token = os.environ.get("HF_TOKEN")
556
+
557
+ access = await get_jobs_access(token or "")
558
+ return {
559
+ "plan": user.get("plan", "free"),
560
+ "can_run_jobs": bool(access and (access.personal_can_run_jobs or access.paid_org_names)),
561
+ "eligible_namespaces": access.eligible_namespaces if access else [],
562
+ "default_namespace": access.default_namespace if access else None,
563
+ }
564
+
565
+
566
  @router.get("/sessions", response_model=list[SessionInfo])
567
  async def list_sessions(user: dict = Depends(get_current_user)) -> list[SessionInfo]:
568
  """List sessions belonging to the authenticated user."""
 
603
  ) -> dict:
604
  """Submit tool approvals to a session. Only accessible by the session owner."""
605
  _check_session_access(request.session_id, user)
606
+ agent_session = session_manager.sessions.get(request.session_id)
607
+ if agent_session is None:
608
+ raise HTTPException(status_code=404, detail="Session not found or inactive")
609
  approvals = [
610
  {
611
  "tool_call_id": a.tool_call_id,
612
  "approved": a.approved,
613
  "feedback": a.feedback,
614
  "edited_script": a.edited_script,
615
+ "namespace": a.namespace,
616
  }
617
  for a in request.approvals
618
  ]
619
+ await _enforce_jobs_access_for_approvals(user, agent_session, approvals)
620
  success = await session_manager.submit_approval(request.session_id, approvals)
621
  if not success:
622
  raise HTTPException(status_code=404, detail="Session not found or inactive")
 
666
  "approved": a["approved"],
667
  "feedback": a.get("feedback"),
668
  "edited_script": a.get("edited_script"),
669
+ "namespace": a.get("namespace"),
670
  }
671
  for a in approvals
672
  ]
673
+ await _enforce_jobs_access_for_approvals(user, agent_session, formatted)
674
  success = await session_manager.submit_approval(session_id, formatted)
675
  elif text is not None:
676
  success = await session_manager.submit_user_input(session_id, text)
 
682
  broadcaster.unsubscribe(sub_id)
683
  raise HTTPException(status_code=404, detail="Session not found or inactive")
684
  except HTTPException:
685
+ broadcaster.unsubscribe(sub_id)
686
  raise
687
  except Exception:
688
  broadcaster.unsubscribe(sub_id)
 
691
  return _sse_response(broadcaster, event_queue, sub_id)
692
 
693
 
694
+ @router.post("/pro-click/{session_id}")
695
+ async def record_pro_click(
696
+ session_id: str,
697
+ body: dict,
698
+ user: dict = Depends(get_current_user),
699
+ ) -> dict:
700
+ """Record a click on a Pro upgrade CTA shown from inside a session."""
701
+ _check_session_access(session_id, user)
702
+ agent_session = session_manager.sessions.get(session_id)
703
+ if not agent_session:
704
+ raise HTTPException(status_code=404, detail="Session not found")
705
+
706
+ from agent.core import telemetry
707
+ await telemetry.record_pro_cta_click(
708
+ agent_session.session,
709
+ source=str(body.get("source") or "unknown"),
710
+ target=str(body.get("target") or "pro_pricing"),
711
+ )
712
+ if agent_session.session.config.save_sessions:
713
+ agent_session.session.save_and_upload_detached(
714
+ agent_session.session.config.session_dataset_repo
715
+ )
716
+ return {"status": "ok"}
717
+
718
+
719
  # ---------------------------------------------------------------------------
720
  # Shared SSE helpers
721
  # ---------------------------------------------------------------------------
 
883
  agent_session.session.config.session_dataset_repo
884
  )
885
  return {"status": "ok"}
 
 
backend/session_manager.py CHANGED
@@ -15,7 +15,7 @@ from agent.core.tools import ToolRouter
15
 
16
  # Get project root (parent of backend directory)
17
  PROJECT_ROOT = Path(__file__).parent.parent
18
- DEFAULT_CONFIG_PATH = str(PROJECT_ROOT / "configs" / "main_agent_config.json")
19
 
20
 
21
  # These dataclasses match agent/main.py structure
 
15
 
16
  # Get project root (parent of backend directory)
17
  PROJECT_ROOT = Path(__file__).parent.parent
18
+ DEFAULT_CONFIG_PATH = str(PROJECT_ROOT / "configs" / "frontend_agent_config.json")
19
 
20
 
21
  # These dataclasses match agent/main.py structure
configs/cli_agent_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "anthropic/claude-opus-4-6",
3
+ "save_sessions": true,
4
+ "session_dataset_repo": "smolagents/ml-intern-sessions",
5
+ "yolo_mode": false,
6
+ "confirm_cpu_jobs": true,
7
+ "auto_file_upload": true,
8
+ "mcpServers": {
9
+ "hf-mcp-server": {
10
+ "transport": "http",
11
+ "url": "https://huggingface.co/mcp?login"
12
+ }
13
+ }
14
+ }
configs/{main_agent_config.json → frontend_agent_config.json} RENAMED
File without changes
frontend/src/components/Chat/ChatInput.tsx CHANGED
@@ -6,8 +6,9 @@ import StopIcon from '@mui/icons-material/Stop';
6
  import { apiFetch } from '@/utils/api';
7
  import { useUserQuota } from '@/hooks/useUserQuota';
8
  import ClaudeCapDialog from '@/components/ClaudeCapDialog';
 
9
  import { useAgentStore } from '@/store/agentStore';
10
- import { FIRST_FREE_MODEL_PATH } from '@/utils/model';
11
 
12
  // Model configuration
13
  interface ModelOption {
@@ -37,7 +38,7 @@ const MODEL_OPTIONS: ModelOption[] = [
37
  id: 'claude-opus',
38
  name: 'Claude Opus 4.6',
39
  description: 'Anthropic',
40
- modelPath: 'anthropic/claude-opus-4-6',
41
  avatarUrl: 'https://huggingface.co/api/avatars/Anthropic',
42
  recommended: true,
43
  },
@@ -65,15 +66,17 @@ interface ChatInputProps {
65
  sessionId?: string;
66
  onSend: (text: string) => void;
67
  onStop?: () => void;
 
 
68
  isProcessing?: boolean;
69
  disabled?: boolean;
70
  placeholder?: string;
71
  }
72
 
73
- const isClaudeModel = (m: ModelOption) => m.modelPath.startsWith('anthropic/');
74
  const firstFreeModel = () => MODEL_OPTIONS.find(m => !isClaudeModel(m)) ?? MODEL_OPTIONS[0];
75
 
76
- export default function ChatInput({ sessionId, onSend, onStop, isProcessing = false, disabled = false, placeholder = 'Ask anything...' }: ChatInputProps) {
77
  const [input, setInput] = useState('');
78
  const inputRef = useRef<HTMLTextAreaElement>(null);
79
  const [selectedModelId, setSelectedModelId] = useState<string>(MODEL_OPTIONS[0].id);
@@ -86,6 +89,8 @@ export default function ChatInput({ sessionId, onSend, onStop, isProcessing = fa
86
  // the hook layer can flip it without threading props through.
87
  const claudeQuotaExhausted = useAgentStore((s) => s.claudeQuotaExhausted);
88
  const setClaudeQuotaExhausted = useAgentStore((s) => s.setClaudeQuotaExhausted);
 
 
89
  const lastSentRef = useRef<string>('');
90
 
91
  // Model is per-session: fetch this tab's current model every time the
@@ -197,6 +202,44 @@ export default function ChatInput({ sessionId, onSend, onStop, isProcessing = fa
197
  } catch { /* ignore */ }
198
  }, [sessionId, onSend, setClaudeQuotaExhausted]);
199
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  // Hide the chip until the user has actually burned quota — an unused
201
  // Opus session shouldn't populate a counter.
202
  const claudeChip = (() => {
@@ -435,6 +478,17 @@ export default function ChatInput({ sessionId, onSend, onStop, isProcessing = fa
435
  cap={quota?.claudeDailyCap ?? 1}
436
  onClose={handleCapDialogClose}
437
  onUseFreeModel={handleUseFreeModel}
 
 
 
 
 
 
 
 
 
 
 
438
  />
439
  </Box>
440
  </Box>
 
6
  import { apiFetch } from '@/utils/api';
7
  import { useUserQuota } from '@/hooks/useUserQuota';
8
  import ClaudeCapDialog from '@/components/ClaudeCapDialog';
9
+ import JobsUpgradeDialog from '@/components/JobsUpgradeDialog';
10
  import { useAgentStore } from '@/store/agentStore';
11
+ import { CLAUDE_MODEL_PATH, FIRST_FREE_MODEL_PATH, isClaudePath } from '@/utils/model';
12
 
13
  // Model configuration
14
  interface ModelOption {
 
38
  id: 'claude-opus',
39
  name: 'Claude Opus 4.6',
40
  description: 'Anthropic',
41
+ modelPath: CLAUDE_MODEL_PATH,
42
  avatarUrl: 'https://huggingface.co/api/avatars/Anthropic',
43
  recommended: true,
44
  },
 
66
  sessionId?: string;
67
  onSend: (text: string) => void;
68
  onStop?: () => void;
69
+ onDeclineBlockedJobs?: () => Promise<boolean>;
70
+ onContinueBlockedJobsWithNamespace?: (namespace: string) => Promise<boolean>;
71
  isProcessing?: boolean;
72
  disabled?: boolean;
73
  placeholder?: string;
74
  }
75
 
76
+ const isClaudeModel = (m: ModelOption) => isClaudePath(m.modelPath);
77
  const firstFreeModel = () => MODEL_OPTIONS.find(m => !isClaudeModel(m)) ?? MODEL_OPTIONS[0];
78
 
79
+ export default function ChatInput({ sessionId, onSend, onStop, onDeclineBlockedJobs, onContinueBlockedJobsWithNamespace, isProcessing = false, disabled = false, placeholder = 'Ask anything...' }: ChatInputProps) {
80
  const [input, setInput] = useState('');
81
  const inputRef = useRef<HTMLTextAreaElement>(null);
82
  const [selectedModelId, setSelectedModelId] = useState<string>(MODEL_OPTIONS[0].id);
 
89
  // the hook layer can flip it without threading props through.
90
  const claudeQuotaExhausted = useAgentStore((s) => s.claudeQuotaExhausted);
91
  const setClaudeQuotaExhausted = useAgentStore((s) => s.setClaudeQuotaExhausted);
92
+ const jobsUpgradeRequired = useAgentStore((s) => s.jobsUpgradeRequired);
93
+ const setJobsUpgradeRequired = useAgentStore((s) => s.setJobsUpgradeRequired);
94
  const lastSentRef = useRef<string>('');
95
 
96
  // Model is per-session: fetch this tab's current model every time the
 
202
  } catch { /* ignore */ }
203
  }, [sessionId, onSend, setClaudeQuotaExhausted]);
204
 
205
+ const handleClaudeUpgradeClick = useCallback(async () => {
206
+ if (!sessionId) return;
207
+ try {
208
+ await apiFetch(`/api/pro-click/${sessionId}`, {
209
+ method: 'POST',
210
+ body: JSON.stringify({ source: 'claude_cap_dialog', target: 'pro_pricing' }),
211
+ });
212
+ } catch {
213
+ /* tracking is best-effort */
214
+ }
215
+ }, [sessionId]);
216
+
217
+ const handleJobsUpgradeClose = useCallback(() => {
218
+ setJobsUpgradeRequired(null);
219
+ }, [setJobsUpgradeRequired]);
220
+
221
+ const handleJobsUpgradeClick = useCallback(async () => {
222
+ if (!sessionId || !jobsUpgradeRequired) return;
223
+ try {
224
+ await apiFetch(`/api/pro-click/${sessionId}`, {
225
+ method: 'POST',
226
+ body: JSON.stringify({ source: 'hf_jobs_upgrade_dialog', target: 'pro_pricing' }),
227
+ });
228
+ } catch {
229
+ /* tracking is best-effort */
230
+ }
231
+ }, [sessionId, jobsUpgradeRequired]);
232
+
233
+ const handleDeclineBlockedJobs = useCallback(async () => {
234
+ if (!onDeclineBlockedJobs) return;
235
+ await onDeclineBlockedJobs();
236
+ }, [onDeclineBlockedJobs]);
237
+
238
+ const handleContinueBlockedJobsWithNamespace = useCallback(async (namespace: string) => {
239
+ if (!onContinueBlockedJobsWithNamespace) return;
240
+ await onContinueBlockedJobsWithNamespace(namespace);
241
+ }, [onContinueBlockedJobsWithNamespace]);
242
+
243
  // Hide the chip until the user has actually burned quota — an unused
244
  // Opus session shouldn't populate a counter.
245
  const claudeChip = (() => {
 
478
  cap={quota?.claudeDailyCap ?? 1}
479
  onClose={handleCapDialogClose}
480
  onUseFreeModel={handleUseFreeModel}
481
+ onUpgrade={handleClaudeUpgradeClick}
482
+ />
483
+ <JobsUpgradeDialog
484
+ open={!!jobsUpgradeRequired}
485
+ mode={jobsUpgradeRequired?.mode || 'upgrade'}
486
+ message={jobsUpgradeRequired?.message || ''}
487
+ eligibleNamespaces={jobsUpgradeRequired?.eligibleNamespaces || []}
488
+ onClose={handleJobsUpgradeClose}
489
+ onUpgrade={handleJobsUpgradeClick}
490
+ onDecline={handleDeclineBlockedJobs}
491
+ onContinueWithNamespace={handleContinueBlockedJobsWithNamespace}
492
  />
493
  </Box>
494
  </Box>
frontend/src/components/ClaudeCapDialog.tsx CHANGED
@@ -19,6 +19,7 @@ interface ClaudeCapDialogProps {
19
  cap: number;
20
  onClose: () => void;
21
  onUseFreeModel: () => void;
 
22
  }
23
 
24
  export default function ClaudeCapDialog({
@@ -27,6 +28,7 @@ export default function ClaudeCapDialog({
27
  cap,
28
  onClose,
29
  onUseFreeModel,
 
30
  }: ClaudeCapDialogProps) {
31
  // plan not surfaced in copy right now — Pro users see the same dialog and
32
  // can upgrade their org if they're also capped.
@@ -100,6 +102,7 @@ export default function ClaudeCapDialog({
100
  href={HF_PRICING_URL}
101
  target="_blank"
102
  rel="noopener noreferrer"
 
103
  variant="contained"
104
  size="small"
105
  sx={{
 
19
  cap: number;
20
  onClose: () => void;
21
  onUseFreeModel: () => void;
22
+ onUpgrade: () => void;
23
  }
24
 
25
  export default function ClaudeCapDialog({
 
28
  cap,
29
  onClose,
30
  onUseFreeModel,
31
+ onUpgrade,
32
  }: ClaudeCapDialogProps) {
33
  // plan not surfaced in copy right now — Pro users see the same dialog and
34
  // can upgrade their org if they're also capped.
 
102
  href={HF_PRICING_URL}
103
  target="_blank"
104
  rel="noopener noreferrer"
105
+ onClick={onUpgrade}
106
  variant="contained"
107
  size="small"
108
  sx={{
frontend/src/components/JobsUpgradeDialog.tsx ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useEffect, useState } from 'react';
2
+ import {
3
+ Box,
4
+ Button,
5
+ Dialog,
6
+ DialogActions,
7
+ DialogContent,
8
+ DialogContentText,
9
+ DialogTitle,
10
+ FormControl,
11
+ InputLabel,
12
+ MenuItem,
13
+ Select,
14
+ Typography,
15
+ } from '@mui/material';
16
+
17
+ const HF_PRICING_URL = 'https://huggingface.co/pricing';
18
+
19
+ interface JobsUpgradeDialogProps {
20
+ open: boolean;
21
+ mode: 'upgrade' | 'namespace';
22
+ message: string;
23
+ eligibleNamespaces: string[];
24
+ onUpgrade: () => void;
25
+ onDecline: () => void;
26
+ onClose: () => void;
27
+ onContinueWithNamespace: (namespace: string) => void;
28
+ }
29
+
30
+ export default function JobsUpgradeDialog({
31
+ open,
32
+ mode,
33
+ message,
34
+ eligibleNamespaces,
35
+ onUpgrade,
36
+ onDecline,
37
+ onClose,
38
+ onContinueWithNamespace,
39
+ }: JobsUpgradeDialogProps) {
40
+ const [selectedNamespace, setSelectedNamespace] = useState('');
41
+
42
+ useEffect(() => {
43
+ if (!open) return;
44
+ setSelectedNamespace(eligibleNamespaces[0] || '');
45
+ }, [open, eligibleNamespaces]);
46
+
47
+ return (
48
+ <Dialog
49
+ open={open}
50
+ onClose={onClose}
51
+ slotProps={{
52
+ backdrop: { sx: { backgroundColor: 'rgba(0,0,0,0.5)', backdropFilter: 'blur(4px)' } },
53
+ }}
54
+ PaperProps={{
55
+ sx: {
56
+ bgcolor: 'var(--panel)',
57
+ border: '1px solid var(--border)',
58
+ borderRadius: 'var(--radius-md)',
59
+ boxShadow: 'var(--shadow-1)',
60
+ maxWidth: 500,
61
+ mx: 2,
62
+ },
63
+ }}
64
+ >
65
+ <DialogTitle
66
+ sx={{ color: 'var(--text)', fontWeight: 700, fontSize: '1rem', pt: 2.5, pb: 0, px: 3 }}
67
+ >
68
+ {mode === 'namespace' ? 'Choose the org for this job' : 'Jobs need Pro or a paid org'}
69
+ </DialogTitle>
70
+ <DialogContent sx={{ px: 3, pt: 1.25, pb: 0 }}>
71
+ <DialogContentText
72
+ sx={{ color: 'var(--muted-text)', fontSize: '0.85rem', lineHeight: 1.6 }}
73
+ >
74
+ {message}
75
+ </DialogContentText>
76
+ {eligibleNamespaces.length > 0 && (
77
+ <Box
78
+ sx={{
79
+ mt: 2,
80
+ p: 1.5,
81
+ borderRadius: '8px',
82
+ bgcolor: 'var(--accent-yellow-weak)',
83
+ border: '1px solid var(--border)',
84
+ }}
85
+ >
86
+ <Typography
87
+ variant="caption"
88
+ sx={{
89
+ display: 'block',
90
+ fontWeight: 700,
91
+ color: 'var(--text)',
92
+ fontSize: '0.78rem',
93
+ mb: 1,
94
+ letterSpacing: '0.02em',
95
+ }}
96
+ >
97
+ Eligible namespaces
98
+ </Typography>
99
+ {mode === 'namespace' ? (
100
+ <FormControl fullWidth size="small">
101
+ <InputLabel id="jobs-namespace-label">Organization</InputLabel>
102
+ <Select
103
+ labelId="jobs-namespace-label"
104
+ value={selectedNamespace}
105
+ label="Organization"
106
+ onChange={(e) => setSelectedNamespace(String(e.target.value))}
107
+ >
108
+ {eligibleNamespaces.map((namespace) => (
109
+ <MenuItem key={namespace} value={namespace}>
110
+ {namespace}
111
+ </MenuItem>
112
+ ))}
113
+ </Select>
114
+ </FormControl>
115
+ ) : (
116
+ <Typography
117
+ variant="caption"
118
+ sx={{ display: 'block', color: 'var(--muted-text)', fontSize: '0.78rem', lineHeight: 1.55 }}
119
+ >
120
+ {eligibleNamespaces.join(', ')}
121
+ </Typography>
122
+ )}
123
+ </Box>
124
+ )}
125
+ <Typography
126
+ variant="caption"
127
+ sx={{ display: 'block', mt: 2, color: 'var(--muted-text)', fontSize: '0.78rem', lineHeight: 1.55 }}
128
+ >
129
+ If you decline, the agent will have to find another way forward without `hf_jobs`.
130
+ </Typography>
131
+ </DialogContent>
132
+ <DialogActions sx={{ px: 3, pb: 2.5, pt: 2, gap: 1 }}>
133
+ {mode === 'namespace' ? (
134
+ <Button
135
+ onClick={() => onContinueWithNamespace(selectedNamespace)}
136
+ disabled={!selectedNamespace}
137
+ variant="contained"
138
+ size="small"
139
+ sx={{
140
+ fontSize: '0.82rem',
141
+ px: 2.5,
142
+ bgcolor: 'var(--accent-yellow)',
143
+ color: '#000',
144
+ textTransform: 'none',
145
+ fontWeight: 700,
146
+ boxShadow: 'none',
147
+ '&:hover': { bgcolor: '#FFB340', boxShadow: 'none' },
148
+ }}
149
+ >
150
+ Run under selected org
151
+ </Button>
152
+ ) : (
153
+ <Button
154
+ component="a"
155
+ href={HF_PRICING_URL}
156
+ target="_blank"
157
+ rel="noopener noreferrer"
158
+ onClick={onUpgrade}
159
+ variant="contained"
160
+ size="small"
161
+ sx={{
162
+ fontSize: '0.82rem',
163
+ px: 2.5,
164
+ bgcolor: 'var(--accent-yellow)',
165
+ color: '#000',
166
+ textTransform: 'none',
167
+ fontWeight: 700,
168
+ boxShadow: 'none',
169
+ '&:hover': { bgcolor: '#FFB340', boxShadow: 'none' },
170
+ }}
171
+ >
172
+ Upgrade to Pro
173
+ </Button>
174
+ )}
175
+ <Button
176
+ onClick={onDecline}
177
+ size="small"
178
+ sx={{
179
+ color: 'var(--muted-text)',
180
+ fontSize: '0.82rem',
181
+ px: 2,
182
+ textTransform: 'none',
183
+ '&:hover': { bgcolor: 'var(--hover-bg)' },
184
+ }}
185
+ >
186
+ Decline tool call
187
+ </Button>
188
+ </DialogActions>
189
+ </Dialog>
190
+ );
191
+ }
frontend/src/components/SessionChat.tsx CHANGED
@@ -26,7 +26,7 @@ export default function SessionChat({ sessionId, isActive, onSessionDead }: Sess
26
  const { updateSessionTitle, sessions } = useSessionStore();
27
  const isExpired = sessions.find((s) => s.id === sessionId)?.expired === true;
28
 
29
- const { messages, sendMessage, stop, status, undoLastTurn, editAndRegenerate, approveTools } = useAgentChat({
30
  sessionId,
31
  isActive,
32
  onReady: () => logger.log(`Session ${sessionId} ready`),
@@ -114,6 +114,8 @@ export default function SessionChat({ sessionId, isActive, onSessionDead }: Sess
114
  sessionId={sessionId}
115
  onSend={handleSendMessage}
116
  onStop={handleStop}
 
 
117
  isProcessing={busy}
118
  disabled={!isConnected || activityStatus.type === 'waiting-approval'}
119
  placeholder={
 
26
  const { updateSessionTitle, sessions } = useSessionStore();
27
  const isExpired = sessions.find((s) => s.id === sessionId)?.expired === true;
28
 
29
+ const { messages, sendMessage, stop, status, undoLastTurn, editAndRegenerate, approveTools, declineBlockedJobs, continueBlockedJobsWithNamespace } = useAgentChat({
30
  sessionId,
31
  isActive,
32
  onReady: () => logger.log(`Session ${sessionId} ready`),
 
114
  sessionId={sessionId}
115
  onSend={handleSendMessage}
116
  onStop={handleStop}
117
+ onDeclineBlockedJobs={declineBlockedJobs}
118
+ onContinueBlockedJobsWithNamespace={continueBlockedJobsWithNamespace}
119
  isProcessing={busy}
120
  disabled={!isConnected || activityStatus.type === 'waiting-approval'}
121
  placeholder={
frontend/src/hooks/useAgentChat.ts CHANGED
@@ -330,6 +330,49 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
330
  messages: UIMessage[];
331
  }>({ setMessages: null, messages: [] });
332
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  // -- useChat from Vercel AI SDK -----------------------------------------
334
  const chat = useChat({
335
  id: sessionId,
@@ -354,6 +397,56 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
354
  }
355
  return;
356
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
357
  logger.error('useChat error:', error);
358
  if (isActiveRef.current) {
359
  useAgentStore.getState().setError(error.message);
@@ -672,12 +765,15 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
672
 
673
  // -- Approve tools ------------------------------------------------------
674
  const approveTools = useCallback(
675
- async (approvals: Array<{ tool_call_id: string; approved: boolean; feedback?: string | null; edited_script?: string | null }>) => {
676
  // Store edited scripts so the transport can read them when sendMessages is called
677
  for (const a of approvals) {
678
  if (a.edited_script) {
679
  useAgentStore.getState().setEditedScript(a.tool_call_id, a.edited_script);
680
  }
 
 
 
681
  }
682
 
683
  // Update SDK tool state — this triggers sendMessages() via the transport
@@ -707,6 +803,37 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
707
  [sessionId, chat, updateSession, setNeedsAttention],
708
  );
709
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
710
  // -- Stop (interrupt backend agent loop, keep SSE open for events) --------
711
  const stop = useCallback(() => {
712
  // Don't call chat.stop() — keep the SSE stream open so the backend's
@@ -763,5 +890,7 @@ export function useAgentChat({ sessionId, isActive, onReady, onError, onSessionD
763
  undoLastTurn,
764
  editAndRegenerate,
765
  approveTools,
 
 
766
  };
767
  }
 
330
  messages: UIMessage[];
331
  }>({ setMessages: null, messages: [] });
332
 
333
+ const hydrateFromBackend = useCallback(async () => {
334
+ try {
335
+ const [msgsRes, infoRes] = await Promise.all([
336
+ apiFetch(`/api/session/${sessionId}/messages`),
337
+ apiFetch(`/api/session/${sessionId}`),
338
+ ]);
339
+ if (!msgsRes.ok) return null;
340
+ const data = await msgsRes.json();
341
+ if (!Array.isArray(data) || data.length === 0) return null;
342
+
343
+ saveBackendMessages(sessionId, data);
344
+
345
+ let pendingIds: Set<string> | undefined;
346
+ let info: Record<string, unknown> | null = null;
347
+ if (infoRes.ok) {
348
+ info = await infoRes.json();
349
+ const pendingApproval = info?.pending_approval;
350
+ if (pendingApproval && Array.isArray(pendingApproval)) {
351
+ pendingIds = new Set(
352
+ pendingApproval.map((t: { tool_call_id: string }) => t.tool_call_id),
353
+ );
354
+ if (pendingIds.size > 0) {
355
+ setNeedsAttention(sessionId, true);
356
+ }
357
+ }
358
+ }
359
+
360
+ const uiMsgs = llmMessagesToUIMessages(data, pendingIds, chatActionsRef.current.messages);
361
+ if (uiMsgs.length > 0) {
362
+ chatActionsRef.current.setMessages?.(uiMsgs);
363
+ saveMessages(sessionId, uiMsgs);
364
+ }
365
+
366
+ if (pendingIds && pendingIds.size > 0) {
367
+ updateSession(sessionId, { activityStatus: { type: 'waiting-approval' }, isProcessing: false });
368
+ }
369
+
370
+ return { data, pendingIds, info };
371
+ } catch {
372
+ return null;
373
+ }
374
+ }, [sessionId, setNeedsAttention]);
375
+
376
  // -- useChat from Vercel AI SDK -----------------------------------------
377
  const chat = useChat({
378
  id: sessionId,
 
397
  }
398
  return;
399
  }
400
+ if (error.message === 'HF_JOBS_UPGRADE_REQUIRED') {
401
+ const typed = error as Error & {
402
+ detail?: Record<string, unknown>;
403
+ approvals?: Array<{
404
+ tool_call_id: string;
405
+ approved: boolean;
406
+ feedback?: string | null;
407
+ edited_script?: string | null;
408
+ }>;
409
+ };
410
+ void hydrateFromBackend();
411
+ if (isActiveRef.current) {
412
+ useAgentStore.getState().setJobsUpgradeRequired({
413
+ approvals: typed.approvals || [],
414
+ toolCallIds: (typed.detail?.tool_call_ids as string[]) || [],
415
+ message: String(
416
+ typed.detail?.message
417
+ || 'Hugging Face Jobs are available only to Pro users and Team or Enterprise organizations.',
418
+ ),
419
+ eligibleNamespaces: (typed.detail?.eligible_namespaces as string[]) || [],
420
+ plan: ((typed.detail?.plan as 'free' | 'pro' | 'org') || 'free'),
421
+ mode: 'upgrade',
422
+ });
423
+ }
424
+ return;
425
+ }
426
+ if (error.message === 'HF_JOBS_NAMESPACE_REQUIRED') {
427
+ const typed = error as Error & {
428
+ detail?: Record<string, unknown>;
429
+ approvals?: Array<{
430
+ tool_call_id: string;
431
+ approved: boolean;
432
+ feedback?: string | null;
433
+ edited_script?: string | null;
434
+ namespace?: string | null;
435
+ }>;
436
+ };
437
+ void hydrateFromBackend();
438
+ if (isActiveRef.current) {
439
+ useAgentStore.getState().setJobsUpgradeRequired({
440
+ approvals: typed.approvals || [],
441
+ toolCallIds: (typed.detail?.tool_call_ids as string[]) || [],
442
+ message: String(typed.detail?.message || 'Choose which organization should own this job run.'),
443
+ eligibleNamespaces: (typed.detail?.eligible_namespaces as string[]) || [],
444
+ plan: ((typed.detail?.plan as 'free' | 'pro' | 'org') || 'free'),
445
+ mode: 'namespace',
446
+ });
447
+ }
448
+ return;
449
+ }
450
  logger.error('useChat error:', error);
451
  if (isActiveRef.current) {
452
  useAgentStore.getState().setError(error.message);
 
765
 
766
  // -- Approve tools ------------------------------------------------------
767
  const approveTools = useCallback(
768
+ async (approvals: Array<{ tool_call_id: string; approved: boolean; feedback?: string | null; edited_script?: string | null; namespace?: string | null }>) => {
769
  // Store edited scripts so the transport can read them when sendMessages is called
770
  for (const a of approvals) {
771
  if (a.edited_script) {
772
  useAgentStore.getState().setEditedScript(a.tool_call_id, a.edited_script);
773
  }
774
+ if (a.namespace) {
775
+ useAgentStore.getState().setApprovalNamespace(a.tool_call_id, a.namespace);
776
+ }
777
  }
778
 
779
  // Update SDK tool state — this triggers sendMessages() via the transport
 
803
  [sessionId, chat, updateSession, setNeedsAttention],
804
  );
805
 
806
+ const declineBlockedJobs = useCallback(async () => {
807
+ const blocked = useAgentStore.getState().jobsUpgradeRequired;
808
+ if (!blocked) return false;
809
+
810
+ const approvals = blocked.approvals.map((approval) => ({
811
+ ...approval,
812
+ approved: blocked.toolCallIds.includes(approval.tool_call_id) ? false : approval.approved,
813
+ feedback: blocked.toolCallIds.includes(approval.tool_call_id)
814
+ ? 'Rejected because this account cannot launch Hugging Face Jobs.'
815
+ : approval.feedback,
816
+ }));
817
+
818
+ useAgentStore.getState().setJobsUpgradeRequired(null);
819
+ return approveTools(approvals);
820
+ }, [approveTools]);
821
+
822
+ const continueBlockedJobsWithNamespace = useCallback(async (namespace: string) => {
823
+ const blocked = useAgentStore.getState().jobsUpgradeRequired;
824
+ if (!blocked) return false;
825
+
826
+ const approvals = blocked.approvals.map((approval) => ({
827
+ ...approval,
828
+ namespace: blocked.toolCallIds.includes(approval.tool_call_id)
829
+ ? namespace
830
+ : approval.namespace,
831
+ }));
832
+
833
+ useAgentStore.getState().setJobsUpgradeRequired(null);
834
+ return approveTools(approvals);
835
+ }, [approveTools]);
836
+
837
  // -- Stop (interrupt backend agent loop, keep SSE open for events) --------
838
  const stop = useCallback(() => {
839
  // Don't call chat.stop() — keep the SSE stream open so the backend's
 
890
  undoLastTurn,
891
  editAndRegenerate,
892
  approveTools,
893
+ declineBlockedJobs,
894
+ continueBlockedJobsWithNamespace,
895
  };
896
  }
frontend/src/lib/sse-chat-transport.ts CHANGED
@@ -320,11 +320,13 @@ export class SSEChatTransport implements ChatTransport<UIMessage> {
320
  const approved = p.approval?.approved ?? true;
321
  // Get edited script from agentStore if available
322
  const editedScript = useAgentStore.getState().getEditedScript(p.toolCallId);
 
323
  return {
324
  tool_call_id: p.toolCallId,
325
  approved,
326
  feedback: approved ? null : (p.approval?.reason || 'Rejected by user'),
327
  edited_script: editedScript ?? null,
 
328
  };
329
  }).filter(Boolean);
330
  body = { approvals };
@@ -362,6 +364,30 @@ export class SSEChatTransport implements ChatTransport<UIMessage> {
362
  // instead of a generic error banner.
363
  throw new Error('CLAUDE_QUOTA_EXHAUSTED');
364
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
  if (!response.ok) {
366
  const errorText = await response.text().catch(() => 'Request failed');
367
  throw new Error(`Chat request failed: ${response.status} ${errorText}`);
 
320
  const approved = p.approval?.approved ?? true;
321
  // Get edited script from agentStore if available
322
  const editedScript = useAgentStore.getState().getEditedScript(p.toolCallId);
323
+ const namespace = useAgentStore.getState().getApprovalNamespace(p.toolCallId);
324
  return {
325
  tool_call_id: p.toolCallId,
326
  approved,
327
  feedback: approved ? null : (p.approval?.reason || 'Rejected by user'),
328
  edited_script: editedScript ?? null,
329
+ namespace: namespace ?? null,
330
  };
331
  }).filter(Boolean);
332
  body = { approvals };
 
364
  // instead of a generic error banner.
365
  throw new Error('CLAUDE_QUOTA_EXHAUSTED');
366
  }
367
+ if (response.status === 402) {
368
+ const payload = await response.json().catch(() => null);
369
+ if (payload?.detail?.error === 'hf_jobs_upgrade_required') {
370
+ const err = new Error('HF_JOBS_UPGRADE_REQUIRED') as Error & {
371
+ detail?: Record<string, unknown>;
372
+ approvals?: Array<Record<string, unknown>>;
373
+ };
374
+ err.detail = payload.detail as Record<string, unknown>;
375
+ err.approvals = (body.approvals as Array<Record<string, unknown>> | undefined) || [];
376
+ throw err;
377
+ }
378
+ }
379
+ if (response.status === 409) {
380
+ const payload = await response.json().catch(() => null);
381
+ if (payload?.detail?.error === 'hf_jobs_namespace_required') {
382
+ const err = new Error('HF_JOBS_NAMESPACE_REQUIRED') as Error & {
383
+ detail?: Record<string, unknown>;
384
+ approvals?: Array<Record<string, unknown>>;
385
+ };
386
+ err.detail = payload.detail as Record<string, unknown>;
387
+ err.approvals = (body.approvals as Array<Record<string, unknown>> | undefined) || [];
388
+ throw err;
389
+ }
390
+ }
391
  if (!response.ok) {
392
  const errorText = await response.text().catch(() => 'Request failed');
393
  throw new Error(`Chat request failed: ${response.status} ${errorText}`);
frontend/src/store/agentStore.ts CHANGED
@@ -45,6 +45,21 @@ export interface LLMHealthError {
45
  model: string;
46
  }
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  export type ActivityStatus =
49
  | { type: 'idle' }
50
  | { type: 'thinking' }
@@ -110,6 +125,7 @@ interface AgentStore {
110
  llmHealthError: LLMHealthError | null;
111
  /** Set when a Claude-send hits the daily quota — ChatInput opens the cap dialog in response. */
112
  claudeQuotaExhausted: boolean;
 
113
 
114
  // Right panel (single-artifact pattern)
115
  panelData: PanelData | null;
@@ -122,6 +138,9 @@ interface AgentStore {
122
  // Edited scripts (tool_call_id -> edited content)
123
  editedScripts: Record<string, string>;
124
 
 
 
 
125
  // Job URLs (tool_call_id -> job URL) for HF jobs
126
  jobUrls: Record<string, string>;
127
 
@@ -156,6 +175,7 @@ interface AgentStore {
156
  setError: (error: string | null) => void;
157
  setLlmHealthError: (error: LLMHealthError | null) => void;
158
  setClaudeQuotaExhausted: (exhausted: boolean) => void;
 
159
 
160
  setPanel: (data: PanelData, view?: PanelView, editable?: boolean) => void;
161
  setPanelView: (view: PanelView) => void;
@@ -170,6 +190,10 @@ interface AgentStore {
170
  getEditedScript: (toolCallId: string) => string | undefined;
171
  clearEditedScripts: () => void;
172
 
 
 
 
 
173
  setJobUrl: (toolCallId: string, jobUrl: string) => void;
174
  getJobUrl: (toolCallId: string) => string | undefined;
175
 
@@ -251,6 +275,7 @@ export const useAgentStore = create<AgentStore>()((set, get) => ({
251
  error: null,
252
  llmHealthError: null,
253
  claudeQuotaExhausted: false,
 
254
 
255
  panelData: null,
256
  panelView: 'script',
@@ -259,6 +284,7 @@ export const useAgentStore = create<AgentStore>()((set, get) => ({
259
  plan: [],
260
 
261
  editedScripts: {},
 
262
  jobUrls: {},
263
  jobStatuses: {},
264
  toolErrors: loadToolErrors(),
@@ -363,6 +389,7 @@ export const useAgentStore = create<AgentStore>()((set, get) => ({
363
  setError: (error) => set({ error }),
364
  setLlmHealthError: (error) => set({ llmHealthError: error }),
365
  setClaudeQuotaExhausted: (exhausted) => set({ claudeQuotaExhausted: exhausted }),
 
366
 
367
  // ── Panel (single-artifact) ───────────────────────────────────────
368
  // Each setter also patches the active session's snapshot so that
@@ -428,6 +455,16 @@ export const useAgentStore = create<AgentStore>()((set, get) => ({
428
 
429
  clearEditedScripts: () => set({ editedScripts: {} }),
430
 
 
 
 
 
 
 
 
 
 
 
431
  // ── Job URLs ────────────────────────────────────────────────────────
432
 
433
  setJobUrl: (toolCallId, jobUrl) => {
 
45
  model: string;
46
  }
47
 
48
+ export interface JobsUpgradeState {
49
+ approvals: Array<{
50
+ tool_call_id: string;
51
+ approved: boolean;
52
+ feedback?: string | null;
53
+ edited_script?: string | null;
54
+ namespace?: string | null;
55
+ }>;
56
+ toolCallIds: string[];
57
+ message: string;
58
+ eligibleNamespaces: string[];
59
+ plan: 'free' | 'pro' | 'org';
60
+ mode: 'upgrade' | 'namespace';
61
+ }
62
+
63
  export type ActivityStatus =
64
  | { type: 'idle' }
65
  | { type: 'thinking' }
 
125
  llmHealthError: LLMHealthError | null;
126
  /** Set when a Claude-send hits the daily quota — ChatInput opens the cap dialog in response. */
127
  claudeQuotaExhausted: boolean;
128
+ jobsUpgradeRequired: JobsUpgradeState | null;
129
 
130
  // Right panel (single-artifact pattern)
131
  panelData: PanelData | null;
 
138
  // Edited scripts (tool_call_id -> edited content)
139
  editedScripts: Record<string, string>;
140
 
141
+ // Namespace overrides chosen for hf_jobs approvals (tool_call_id -> namespace)
142
+ approvalNamespaces: Record<string, string>;
143
+
144
  // Job URLs (tool_call_id -> job URL) for HF jobs
145
  jobUrls: Record<string, string>;
146
 
 
175
  setError: (error: string | null) => void;
176
  setLlmHealthError: (error: LLMHealthError | null) => void;
177
  setClaudeQuotaExhausted: (exhausted: boolean) => void;
178
+ setJobsUpgradeRequired: (state: JobsUpgradeState | null) => void;
179
 
180
  setPanel: (data: PanelData, view?: PanelView, editable?: boolean) => void;
181
  setPanelView: (view: PanelView) => void;
 
190
  getEditedScript: (toolCallId: string) => string | undefined;
191
  clearEditedScripts: () => void;
192
 
193
+ setApprovalNamespace: (toolCallId: string, namespace: string) => void;
194
+ getApprovalNamespace: (toolCallId: string) => string | undefined;
195
+ clearApprovalNamespaces: () => void;
196
+
197
  setJobUrl: (toolCallId: string, jobUrl: string) => void;
198
  getJobUrl: (toolCallId: string) => string | undefined;
199
 
 
275
  error: null,
276
  llmHealthError: null,
277
  claudeQuotaExhausted: false,
278
+ jobsUpgradeRequired: null,
279
 
280
  panelData: null,
281
  panelView: 'script',
 
284
  plan: [],
285
 
286
  editedScripts: {},
287
+ approvalNamespaces: {},
288
  jobUrls: {},
289
  jobStatuses: {},
290
  toolErrors: loadToolErrors(),
 
389
  setError: (error) => set({ error }),
390
  setLlmHealthError: (error) => set({ llmHealthError: error }),
391
  setClaudeQuotaExhausted: (exhausted) => set({ claudeQuotaExhausted: exhausted }),
392
+ setJobsUpgradeRequired: (state) => set({ jobsUpgradeRequired: state }),
393
 
394
  // ── Panel (single-artifact) ───────────────────────────────────────
395
  // Each setter also patches the active session's snapshot so that
 
455
 
456
  clearEditedScripts: () => set({ editedScripts: {} }),
457
 
458
+ setApprovalNamespace: (toolCallId, namespace) => {
459
+ set((state) => ({
460
+ approvalNamespaces: { ...state.approvalNamespaces, [toolCallId]: namespace },
461
+ }));
462
+ },
463
+
464
+ getApprovalNamespace: (toolCallId) => get().approvalNamespaces[toolCallId],
465
+
466
+ clearApprovalNamespaces: () => set({ approvalNamespaces: {} }),
467
+
468
  // ── Job URLs ────────────────────────────────────────────────────────
469
 
470
  setJobUrl: (toolCallId, jobUrl) => {
frontend/src/types/agent.ts CHANGED
@@ -27,6 +27,7 @@ export interface ToolApproval {
27
  tool_call_id: string;
28
  approved: boolean;
29
  feedback?: string | null;
 
30
  }
31
 
32
  export interface User {
 
27
  tool_call_id: string;
28
  approved: boolean;
29
  feedback?: string | null;
30
+ namespace?: string | null;
31
  }
32
 
33
  export interface User {
frontend/src/utils/model.ts CHANGED
@@ -3,13 +3,12 @@
3
  * ClaudeCapDialog "Use a free model" escape hatch.
4
  *
5
  * Keep in sync with MODEL_OPTIONS in components/Chat/ChatInput.tsx and
6
- * AVAILABLE_MODELS in backend/routes/agent.py. Bare HF ids (no
7
- * `huggingface/` prefix) — matches upstream's auto-router.
8
  */
9
 
10
- export const CLAUDE_MODEL_PATH = 'anthropic/claude-opus-4-6';
11
  export const FIRST_FREE_MODEL_PATH = 'moonshotai/Kimi-K2.6';
12
 
13
  export function isClaudePath(modelPath: string | undefined): boolean {
14
- return !!modelPath && modelPath.startsWith('anthropic/');
15
  }
 
3
  * ClaudeCapDialog "Use a free model" escape hatch.
4
  *
5
  * Keep in sync with MODEL_OPTIONS in components/Chat/ChatInput.tsx and
6
+ * AVAILABLE_MODELS in backend/routes/agent.py.
 
7
  */
8
 
9
+ export const CLAUDE_MODEL_PATH = 'bedrock/us.anthropic.claude-opus-4-6-v1';
10
  export const FIRST_FREE_MODEL_PATH = 'moonshotai/Kimi-K2.6';
11
 
12
  export function isClaudePath(modelPath: string | undefined): boolean {
13
+ return !!modelPath && modelPath.includes('anthropic');
14
  }
pyproject.toml CHANGED
@@ -1,5 +1,5 @@
1
  [project]
2
- name = "hf-agent"
3
  version = "0.1.0"
4
  description = "Add your description here"
5
  readme = "README.md"
@@ -42,11 +42,12 @@ eval = [
42
  # Development and testing dependencies
43
  dev = [
44
  "pytest>=9.0.2",
 
45
  ]
46
 
47
  # All dependencies (eval + dev)
48
  all = [
49
- "hf-agent[eval,dev]",
50
  ]
51
 
52
  [project.scripts]
@@ -61,3 +62,6 @@ include = ["agent*"]
61
 
62
  [tool.uv]
63
  package = true
 
 
 
 
1
  [project]
2
+ name = "ml-intern"
3
  version = "0.1.0"
4
  description = "Add your description here"
5
  readme = "README.md"
 
42
  # Development and testing dependencies
43
  dev = [
44
  "pytest>=9.0.2",
45
+ "pytest-asyncio>=0.26.0",
46
  ]
47
 
48
  # All dependencies (eval + dev)
49
  all = [
50
+ "ml-intern[eval,dev]",
51
  ]
52
 
53
  [project.scripts]
 
62
 
63
  [tool.uv]
64
  package = true
65
+
66
+ [tool.pytest.ini_options]
67
+ asyncio_mode = "auto"
scripts/build_kpis.py CHANGED
@@ -44,7 +44,8 @@ re-running the same hour overwrites.
44
  regenerate_rate — sessions with any `undo_complete` event / sessions
45
  time_to_first_action_s_p50 / _p95 — from session_start to first tool_call
46
  thumbs_up / thumbs_down
47
- hf_jobs_submitted / _succeeded
 
48
  gpu_hours_by_flavor_json — JSON-serialised {flavor: gpu-hours}
49
 
50
  ================================================================================
@@ -210,7 +211,8 @@ def _session_metrics(session: dict) -> dict:
210
  "tool_calls_total": 0, "tool_calls_success": 0,
211
  "failures": 0, "regenerate_sessions": 0,
212
  "thumbs_up": 0, "thumbs_down": 0,
213
- "hf_jobs_submitted": 0, "hf_jobs_succeeded": 0,
 
214
  "first_tool_s": -1,
215
  }
216
  events = session.get("events") or []
@@ -229,8 +231,11 @@ def _session_metrics(session: dict) -> dict:
229
  gpu_hours_by_flavor: dict[str, float] = defaultdict(float)
230
  jobs_submitted = 0
231
  jobs_succeeded = 0
 
232
  thumbs_up = 0
233
  thumbs_down = 0
 
 
234
 
235
  start_dt = _parse_ts(session_start)
236
 
@@ -283,6 +288,14 @@ def _session_metrics(session: dict) -> dict:
283
  if status in ("completed", "succeeded", "success"):
284
  jobs_succeeded += 1
285
 
 
 
 
 
 
 
 
 
286
  out["tool_calls_total"] = tool_total
287
  out["tool_calls_success"] = tool_success
288
  out["failures"] = 1 if had_error else 0
@@ -291,8 +304,11 @@ def _session_metrics(session: dict) -> dict:
291
  out["thumbs_down"] = thumbs_down
292
  out["hf_jobs_submitted"] = jobs_submitted
293
  out["hf_jobs_succeeded"] = jobs_succeeded
 
 
294
  out["first_tool_s"] = first_tool_ts if first_tool_ts is not None else -1
295
  out["_gpu_hours_by_flavor"] = dict(gpu_hours_by_flavor)
 
296
  out["_user"] = session.get("user_id") or session.get("session_id")
297
  return dict(out)
298
 
@@ -301,9 +317,12 @@ def _aggregate(per_session: list[dict]) -> dict:
301
  """Collapse a bucket's worth of session rollups into the final KPI row."""
302
  ttfa_values = [s["first_tool_s"] for s in per_session if s.get("first_tool_s", -1) >= 0]
303
  gpu_hours: dict[str, float] = defaultdict(float)
 
304
  for s in per_session:
305
  for f, h in (s.get("_gpu_hours_by_flavor") or {}).items():
306
  gpu_hours[f] += h
 
 
307
 
308
  total_sessions = sum(s["sessions"] for s in per_session)
309
  total_turns = sum(s["turns"] for s in per_session)
@@ -340,7 +359,10 @@ def _aggregate(per_session: list[dict]) -> dict:
340
  "thumbs_down": int(sum(s["thumbs_down"] for s in per_session)),
341
  "hf_jobs_submitted": int(sum(s["hf_jobs_submitted"] for s in per_session)),
342
  "hf_jobs_succeeded": int(sum(s["hf_jobs_succeeded"] for s in per_session)),
 
 
343
  "gpu_hours_by_flavor_json": json.dumps(dict(gpu_hours), sort_keys=True),
 
344
  }
345
 
346
 
 
44
  regenerate_rate — sessions with any `undo_complete` event / sessions
45
  time_to_first_action_s_p50 / _p95 — from session_start to first tool_call
46
  thumbs_up / thumbs_down
47
+ hf_jobs_submitted / _succeeded / _blocked
48
+ pro_cta_clicks
49
  gpu_hours_by_flavor_json — JSON-serialised {flavor: gpu-hours}
50
 
51
  ================================================================================
 
211
  "tool_calls_total": 0, "tool_calls_success": 0,
212
  "failures": 0, "regenerate_sessions": 0,
213
  "thumbs_up": 0, "thumbs_down": 0,
214
+ "hf_jobs_submitted": 0, "hf_jobs_succeeded": 0, "hf_jobs_blocked": 0,
215
+ "pro_cta_clicks": 0,
216
  "first_tool_s": -1,
217
  }
218
  events = session.get("events") or []
 
231
  gpu_hours_by_flavor: dict[str, float] = defaultdict(float)
232
  jobs_submitted = 0
233
  jobs_succeeded = 0
234
+ jobs_blocked = 0
235
  thumbs_up = 0
236
  thumbs_down = 0
237
+ pro_cta_clicks = 0
238
+ pro_cta_by_source: dict[str, int] = defaultdict(int)
239
 
240
  start_dt = _parse_ts(session_start)
241
 
 
288
  if status in ("completed", "succeeded", "success"):
289
  jobs_succeeded += 1
290
 
291
+ elif et == "jobs_access_blocked":
292
+ jobs_blocked += 1
293
+
294
+ elif et == "pro_cta_click":
295
+ pro_cta_clicks += 1
296
+ source = str(data.get("source") or "unknown")
297
+ pro_cta_by_source[source] += 1
298
+
299
  out["tool_calls_total"] = tool_total
300
  out["tool_calls_success"] = tool_success
301
  out["failures"] = 1 if had_error else 0
 
304
  out["thumbs_down"] = thumbs_down
305
  out["hf_jobs_submitted"] = jobs_submitted
306
  out["hf_jobs_succeeded"] = jobs_succeeded
307
+ out["hf_jobs_blocked"] = jobs_blocked
308
+ out["pro_cta_clicks"] = pro_cta_clicks
309
  out["first_tool_s"] = first_tool_ts if first_tool_ts is not None else -1
310
  out["_gpu_hours_by_flavor"] = dict(gpu_hours_by_flavor)
311
+ out["_pro_cta_by_source"] = dict(pro_cta_by_source)
312
  out["_user"] = session.get("user_id") or session.get("session_id")
313
  return dict(out)
314
 
 
317
  """Collapse a bucket's worth of session rollups into the final KPI row."""
318
  ttfa_values = [s["first_tool_s"] for s in per_session if s.get("first_tool_s", -1) >= 0]
319
  gpu_hours: dict[str, float] = defaultdict(float)
320
+ pro_cta_by_source: dict[str, int] = defaultdict(int)
321
  for s in per_session:
322
  for f, h in (s.get("_gpu_hours_by_flavor") or {}).items():
323
  gpu_hours[f] += h
324
+ for source, count in (s.get("_pro_cta_by_source") or {}).items():
325
+ pro_cta_by_source[source] += int(count)
326
 
327
  total_sessions = sum(s["sessions"] for s in per_session)
328
  total_turns = sum(s["turns"] for s in per_session)
 
359
  "thumbs_down": int(sum(s["thumbs_down"] for s in per_session)),
360
  "hf_jobs_submitted": int(sum(s["hf_jobs_submitted"] for s in per_session)),
361
  "hf_jobs_succeeded": int(sum(s["hf_jobs_succeeded"] for s in per_session)),
362
+ "hf_jobs_blocked": int(sum(s["hf_jobs_blocked"] for s in per_session)),
363
+ "pro_cta_clicks": int(sum(s["pro_cta_clicks"] for s in per_session)),
364
  "gpu_hours_by_flavor_json": json.dumps(dict(gpu_hours), sort_keys=True),
365
+ "pro_cta_by_source_json": json.dumps(dict(pro_cta_by_source), sort_keys=True),
366
  }
367
 
368
 
tests/unit/test_build_kpis.py CHANGED
@@ -88,6 +88,22 @@ def test_hf_job_gpu_hours():
88
  assert abs(m["_gpu_hours_by_flavor"]["a100-large"] - 1.0) < 1e-6
89
 
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  def test_feedback_counts():
92
  mod = _load()
93
  events = [
@@ -120,6 +136,22 @@ def test_aggregate_day_cache_hit_and_users():
120
  assert abs(row["cost_usd"] - 1.5) < 1e-9
121
 
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  def test_failure_and_regenerate_rates():
124
  mod = _load()
125
  s1 = mod._session_metrics(_session([_ev("error", {"error": "boom"})], user_id="a"))
 
88
  assert abs(m["_gpu_hours_by_flavor"]["a100-large"] - 1.0) < 1e-6
89
 
90
 
91
def test_hf_job_blocked_and_pro_clicks_are_counted():
    """Blocked-job and Pro-CTA events roll up into per-session counters."""
    mod = _load()
    session = _session([
        _ev("jobs_access_blocked", {"tool_call_ids": ["tc1"], "plan": "free"}),
        _ev("pro_cta_click", {"source": "hf_jobs_upgrade_dialog"}),
        _ev("pro_cta_click", {"source": "claude_cap_dialog"}),
    ])
    metrics = mod._session_metrics(session)

    assert metrics["hf_jobs_blocked"] == 1
    assert metrics["pro_cta_clicks"] == 2
    expected_sources = {
        "hf_jobs_upgrade_dialog": 1,
        "claude_cap_dialog": 1,
    }
    assert metrics["_pro_cta_by_source"] == expected_sources
106
+
107
  def test_feedback_counts():
108
  mod = _load()
109
  events = [
 
136
  assert abs(row["cost_usd"] - 1.5) < 1e-9
137
 
138
 
139
+ def test_aggregate_day_sums_pro_click_sources():
140
+ mod = _load()
141
+ s1 = mod._session_metrics(_session([
142
+ _ev("pro_cta_click", {"source": "hf_jobs_upgrade_dialog"}),
143
+ _ev("pro_cta_click", {"source": "hf_jobs_upgrade_dialog"}),
144
+ ], user_id="u1"))
145
+ s2 = mod._session_metrics(_session([
146
+ _ev("pro_cta_click", {"source": "claude_cap_dialog"}),
147
+ ], user_id="u2"))
148
+ row = mod._aggregate_day([s1, s2])
149
+ assert row["pro_cta_clicks"] == 3
150
+ assert row["pro_cta_by_source_json"] == (
151
+ '{"claude_cap_dialog": 1, "hf_jobs_upgrade_dialog": 2}'
152
+ )
153
+
154
+
155
  def test_failure_and_regenerate_rates():
156
  mod = _load()
157
  s1 = mod._session_metrics(_session([_ev("error", {"error": "boom"})], user_id="a"))
tests/unit/test_cli_rendering.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Regression tests for interactive CLI rendering and research model routing."""
2
+
3
+ from io import StringIO
4
+ from types import SimpleNamespace
5
+
6
+ from agent.tools.research_tool import _get_research_model
7
+ from agent.utils import terminal_display
8
+
9
+
10
+ def test_direct_anthropic_research_model_stays_off_bedrock():
11
+ assert _get_research_model("anthropic/claude-opus-4-6") == "anthropic/claude-sonnet-4-6"
12
+
13
+
14
+ def test_bedrock_anthropic_research_model_stays_on_bedrock():
15
+ assert (
16
+ _get_research_model("bedrock/us.anthropic.claude-opus-4-6-v1")
17
+ == "bedrock/us.anthropic.claude-sonnet-4-6"
18
+ )
19
+
20
+
21
+ def test_non_anthropic_research_model_is_unchanged():
22
+ assert _get_research_model("openai/gpt-5.4") == "openai/gpt-5.4"
23
+
24
+
25
+ def test_subagent_display_does_not_spawn_background_redraw(monkeypatch):
26
+ calls: list[object] = []
27
+
28
+ def _unexpected_future(*args, **kwargs):
29
+ calls.append((args, kwargs))
30
+ raise AssertionError("background redraw task should not be created")
31
+
32
+ monkeypatch.setattr("asyncio.ensure_future", _unexpected_future)
33
+ monkeypatch.setattr(
34
+ terminal_display,
35
+ "_console",
36
+ SimpleNamespace(file=StringIO(), width=100),
37
+ )
38
+
39
+ mgr = terminal_display.SubAgentDisplayManager()
40
+ mgr.start("agent-1", "research")
41
+ mgr.add_call("agent-1", "▸ hf_papers {\"operation\": \"search\"}")
42
+ mgr.clear("agent-1")
43
+
44
+ assert calls == []
tests/unit/test_dangling_tool_calls.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Regression tests for `_patch_dangling_tool_calls`.
2
+
3
+ Reproduces the failure mode behind observatory sessions 8dd2ce30 and
4
+ 59c9e678 (2026-04-25): a tool call cancelled mid-execution leaves an
5
+ orphan ``tool_use`` in history; the user types a follow-up; Bedrock
6
+ rejects the next request with HTTP 400 ``messages.N: tool_use ids were
7
+ found without tool_result blocks immediately after``.
8
+ """
9
+
10
+ from litellm import ChatCompletionMessageToolCall, Message
11
+
12
+ from agent.context_manager.manager import ContextManager
13
+
14
+
15
+ def _tool_call(call_id: str, name: str = "research") -> ChatCompletionMessageToolCall:
16
+ return ChatCompletionMessageToolCall(
17
+ id=call_id,
18
+ type="function",
19
+ function={"name": name, "arguments": "{}"},
20
+ )
21
+
22
+
23
+ def _make_cm() -> ContextManager:
24
+ cm = ContextManager.__new__(ContextManager)
25
+ cm.system_prompt = "system"
26
+ cm.model_max_tokens = 100_000
27
+ cm.compact_size = 1_000
28
+ cm.running_context_usage = 0
29
+ cm.untouched_messages = 5
30
+ cm.items = [Message(role="system", content="system")]
31
+ return cm
32
+
33
+
34
+ def test_orphan_tool_use_followed_by_user_message_is_patched():
35
+ cm = _make_cm()
36
+ cm.items.extend([
37
+ Message(role="user", content="Research X"),
38
+ Message(
39
+ role="assistant",
40
+ content=None,
41
+ tool_calls=[_tool_call("call_abc", "research")],
42
+ ),
43
+ Message(role="user", content="??"),
44
+ ])
45
+ msgs = cm.get_messages()
46
+ tool_msgs = [m for m in msgs if getattr(m, "role", None) == "tool"]
47
+ assert len(tool_msgs) == 1
48
+ assert tool_msgs[0].tool_call_id == "call_abc"
49
+ assert "interrupted" in (tool_msgs[0].content or "").lower() or "not executed" in (tool_msgs[0].content or "").lower()
50
+
51
+
52
+ def test_no_orphan_means_no_stub():
53
+ cm = _make_cm()
54
+ cm.items.extend([
55
+ Message(role="user", content="Research X"),
56
+ Message(
57
+ role="assistant",
58
+ content=None,
59
+ tool_calls=[_tool_call("call_abc", "research")],
60
+ ),
61
+ Message(role="tool", content="ok", tool_call_id="call_abc", name="research"),
62
+ ])
63
+ cm.get_messages()
64
+ tool_msgs = [m for m in cm.items if getattr(m, "role", None) == "tool"]
65
+ assert len(tool_msgs) == 1
66
+ assert tool_msgs[0].content == "ok"
67
+
68
+
69
+ def test_multiple_dangling_tool_calls_in_one_assistant_message_are_all_patched():
70
+ cm = _make_cm()
71
+ cm.items.extend([
72
+ Message(role="user", content="do two things"),
73
+ Message(
74
+ role="assistant",
75
+ content=None,
76
+ tool_calls=[
77
+ _tool_call("call_1", "research"),
78
+ _tool_call("call_2", "bash"),
79
+ ],
80
+ ),
81
+ Message(role="user", content="follow up"),
82
+ ])
83
+ cm.get_messages()
84
+ tool_ids = {
85
+ getattr(m, "tool_call_id", None)
86
+ for m in cm.items
87
+ if getattr(m, "role", None) == "tool"
88
+ }
89
+ assert tool_ids == {"call_1", "call_2"}
90
+
91
+
92
+ def test_orphan_in_earlier_turn_still_gets_patched():
93
+ """Two-turn history where the FIRST turn was interrupted.
94
+
95
+ Old patcher stopped at the first user msg encountered while scanning
96
+ backwards, so this case never got fixed and Bedrock rejected.
97
+ """
98
+ cm = _make_cm()
99
+ cm.items.extend([
100
+ Message(role="user", content="turn 1"),
101
+ Message(
102
+ role="assistant",
103
+ content=None,
104
+ tool_calls=[_tool_call("call_old", "research")],
105
+ ),
106
+ Message(role="user", content="turn 2 — please retry"),
107
+ Message(
108
+ role="assistant",
109
+ content=None,
110
+ tool_calls=[_tool_call("call_new", "bash")],
111
+ ),
112
+ Message(role="tool", content="ok", tool_call_id="call_new", name="bash"),
113
+ ])
114
+ cm.get_messages()
115
+ tool_ids = {
116
+ getattr(m, "tool_call_id", None)
117
+ for m in cm.items
118
+ if getattr(m, "role", None) == "tool"
119
+ }
120
+ assert "call_old" in tool_ids
121
+ assert "call_new" in tool_ids
tests/unit/test_doom_loop_polling.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Regression test for doom-loop false-positive on legitimate polling.
2
+
3
+ Reproduces the failure mode in observatory sessions 40fcb414 ($32.59),
4
+ 8e90352e ($62.63), and 403178bf ($5.71) on 2026-04-25: the agent polled a
5
+ long-running job with `bash sleep 300 && wc -l output` four times in a
6
+ row. The arguments were byte-identical, but the results moved (27210 →
7
+ 36454 → 45770 → 55138 — actual progress). The detector hashed args only
8
+ and false-fired DOOM LOOP, which made the agent abandon perfectly valid
9
+ polling.
10
+
11
+ After the fix the signature includes the tool result hash, so identical
12
+ args + different results no longer trips the detector.
13
+ """
14
+
15
+ from litellm import ChatCompletionMessageToolCall, Message
16
+
17
+ from agent.core.doom_loop import check_for_doom_loop
18
+
19
+
20
+ def _assistant(call_id: str, name: str, args: str) -> Message:
21
+ return Message(
22
+ role="assistant",
23
+ content=None,
24
+ tool_calls=[
25
+ ChatCompletionMessageToolCall(
26
+ id=call_id,
27
+ type="function",
28
+ function={"name": name, "arguments": args},
29
+ )
30
+ ],
31
+ )
32
+
33
+
34
+ def _tool(call_id: str, name: str, content: str) -> Message:
35
+ return Message(role="tool", content=content, tool_call_id=call_id, name=name)
36
+
37
+
38
+ _POLL_ARGS = '{"command": "sleep 300 && ls /app/images/ | wc -l"}'
39
+
40
+
41
+ def test_polling_with_progressing_results_does_not_fire():
42
+ msgs = [
43
+ Message(role="user", content="run the job"),
44
+ _assistant("c1", "bash", _POLL_ARGS),
45
+ _tool("c1", "bash", "27210"),
46
+ _assistant("c2", "bash", _POLL_ARGS),
47
+ _tool("c2", "bash", "36454"),
48
+ _assistant("c3", "bash", _POLL_ARGS),
49
+ _tool("c3", "bash", "45770"),
50
+ _assistant("c4", "bash", _POLL_ARGS),
51
+ _tool("c4", "bash", "55138"),
52
+ ]
53
+ assert check_for_doom_loop(msgs) is None
54
+
55
+
56
+ def test_truly_stuck_polling_with_identical_results_still_fires():
57
+ """If the same poll returns the same number, the job is genuinely
58
+ stuck and the detector SHOULD fire."""
59
+ msgs = [
60
+ _assistant("c1", "bash", _POLL_ARGS),
61
+ _tool("c1", "bash", "55138"),
62
+ _assistant("c2", "bash", _POLL_ARGS),
63
+ _tool("c2", "bash", "55138"),
64
+ _assistant("c3", "bash", _POLL_ARGS),
65
+ _tool("c3", "bash", "55138"),
66
+ ]
67
+ prompt = check_for_doom_loop(msgs)
68
+ assert prompt is not None
69
+ assert "DOOM LOOP" in prompt
70
+ assert "bash" in prompt
71
+
72
+
73
+ def test_identical_calls_with_no_results_yet_still_fires():
74
+ """If three identical calls have no tool results (e.g. all cancelled
75
+ or errored before a result was recorded), treat as a real loop."""
76
+ msgs = [
77
+ _assistant("c1", "write", '{"path": "/tmp/x", "content": "..."}'),
78
+ _assistant("c2", "write", '{"path": "/tmp/x", "content": "..."}'),
79
+ _assistant("c3", "write", '{"path": "/tmp/x", "content": "..."}'),
80
+ ]
81
+ prompt = check_for_doom_loop(msgs)
82
+ assert prompt is not None
83
+ assert "DOOM LOOP" in prompt
84
+ assert "write" in prompt
85
+
86
+
87
+ def test_different_args_does_not_fire():
88
+ msgs = [
89
+ _assistant("c1", "bash", '{"command": "ls /a"}'),
90
+ _tool("c1", "bash", "ok"),
91
+ _assistant("c2", "bash", '{"command": "ls /b"}'),
92
+ _tool("c2", "bash", "ok"),
93
+ _assistant("c3", "bash", '{"command": "ls /c"}'),
94
+ _tool("c3", "bash", "ok"),
95
+ ]
96
+ assert check_for_doom_loop(msgs) is None
tests/unit/test_hf_access.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Unit tests for deriving HF Jobs access from the whoami payload."""

from agent.core.hf_access import jobs_access_from_whoami


def _whoami(name: str, plan: str, orgs: list[dict]) -> dict:
    # Minimal stand-in for the whoami response fields these tests rely on.
    return {"name": name, "plan": plan, "orgs": orgs}


def test_personal_pro_prefers_username_namespace():
    result = jobs_access_from_whoami(_whoami("alice", "pro", []))
    assert result.plan == "pro"
    assert result.eligible_namespaces == ["alice"]
    assert result.default_namespace == "alice"


def test_free_user_with_paid_org_uses_org_namespace():
    orgs = [
        {"name": "team-a", "plan": "team"},
        {"name": "oss-friends", "plan": "free"},
    ]
    result = jobs_access_from_whoami(_whoami("alice", "free", orgs))
    assert result.plan == "org"
    assert result.personal_can_run_jobs is False
    assert result.eligible_namespaces == ["team-a"]
    assert result.default_namespace is None


def test_free_user_without_paid_org_cannot_run_jobs():
    orgs = [{"name": "community", "plan": "free"}]
    result = jobs_access_from_whoami(_whoami("alice", "free", orgs))
    assert result.plan == "free"
    assert result.can_run_jobs is False
    assert result.eligible_namespaces == []
    assert result.default_namespace is None
tests/unit/test_llm_error_classification.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for LLM error classification helpers in agent.core.agent_loop.
2
+
3
+ Covers two regressions on 2026-04-25:
4
+
5
+ 1. Non-Anthropic context overflow (Kimi 365k > 262k) was not classified as
6
+ ``_is_context_overflow_error``, so the recovery path didn't fire and
7
+ session 62ccfdcb died with 68 wasted compaction events.
8
+
9
+ 2. Bedrock TPM rate limit (`Too many tokens, please wait before trying
10
+ again.`) needs the longer rate-limit retry schedule. The old schedule
11
+ ([5, 15, 30] = 50s) burned through 6 sessions costing >$2,400 combined
12
+ on the same day.
13
+ """
14
+
15
+ from agent.core.agent_loop import (
16
+ _MAX_LLM_RETRIES,
17
+ _LLM_RATE_LIMIT_RETRY_DELAYS,
18
+ _LLM_RETRY_DELAYS,
19
+ _is_context_overflow_error,
20
+ _is_rate_limit_error,
21
+ _is_transient_error,
22
+ _retry_delay_for,
23
+ )
24
+
25
+
26
+ # ── context overflow ────────────────────────────────────────────────────
27
+
28
+
29
+ def test_kimi_prompt_too_long_is_context_overflow():
30
+ # Verbatim error text from session 62ccfdcb (2026-04-25, Kimi K2.6).
31
+ err = Exception(
32
+ "litellm.BadRequestError: OpenAIException - The prompt is too long: "
33
+ "365407, model maximum context length: 262143"
34
+ )
35
+ assert _is_context_overflow_error(err)
36
+
37
+
38
+ def test_openai_context_length_exceeded_is_context_overflow():
39
+ err = Exception("Error: This model's maximum context length is 8192 tokens.")
40
+ assert _is_context_overflow_error(err)
41
+
42
+
43
+ def test_random_error_is_not_context_overflow():
44
+ err = Exception("connection reset by peer")
45
+ assert not _is_context_overflow_error(err)
46
+
47
+
48
+ # ── rate limit ──────────────────────────────────────────────────────────
49
+
50
+
51
+ def test_bedrock_too_many_tokens_is_rate_limit():
52
+ # Verbatim from sessions b37a3823, c4d7a831, b63c4933 (2026-04-25).
53
+ err = Exception(
54
+ 'litellm.RateLimitError: BedrockException - {"message":"Too many '
55
+ 'tokens, please wait before trying again."}'
56
+ )
57
+ assert _is_rate_limit_error(err)
58
+ # Rate-limit errors are also classified as transient.
59
+ assert _is_transient_error(err)
60
+
61
+
62
+ def test_429_is_rate_limit():
63
+ err = Exception("HTTP 429 Too Many Requests")
64
+ assert _is_rate_limit_error(err)
65
+
66
+
67
+ def test_timeout_is_transient_but_not_rate_limit():
68
+ err = Exception("Request timed out after 600s")
69
+ assert _is_transient_error(err)
70
+ assert not _is_rate_limit_error(err)
71
+
72
+
73
+ # ── retry schedule selection ────────────────────────────────────────────
74
+
75
+
76
+ def test_rate_limit_uses_longer_schedule():
77
+ err = Exception("Too many tokens, please wait before trying again.")
78
+ delays = [_retry_delay_for(err, i) for i in range(len(_LLM_RATE_LIMIT_RETRY_DELAYS))]
79
+ assert delays == _LLM_RATE_LIMIT_RETRY_DELAYS
80
+ # Just past the schedule → None (stop retrying).
81
+ assert _retry_delay_for(err, len(_LLM_RATE_LIMIT_RETRY_DELAYS)) is None
82
+
83
+
84
+ def test_other_transient_uses_short_schedule():
85
+ err = Exception("503 service unavailable")
86
+ delays = [_retry_delay_for(err, i) for i in range(len(_LLM_RETRY_DELAYS))]
87
+ assert delays == _LLM_RETRY_DELAYS
88
+ assert _retry_delay_for(err, len(_LLM_RETRY_DELAYS)) is None
89
+
90
+
91
+ def test_non_transient_returns_none():
92
+ err = Exception("invalid request: bad parameter")
93
+ assert _retry_delay_for(err, 0) is None
94
+
95
+
96
+ def test_rate_limit_total_budget_covers_bedrock_bucket_recovery():
97
+ """The whole point of the rate-limit schedule: total wait time should
98
+ exceed the ~60s Bedrock TPM bucket recovery window."""
99
+ assert len(_LLM_RATE_LIMIT_RETRY_DELAYS) == _MAX_LLM_RETRIES - 1
100
+ assert sum(_LLM_RATE_LIMIT_RETRY_DELAYS) > 60
tests/unit/test_llm_params.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Tests for reasoning-effort handling in `_resolve_llm_params`."""

from agent.core.llm_params import UnsupportedEffortError, _resolve_llm_params


def test_openai_xhigh_effort_is_forwarded():
    resolved = _resolve_llm_params(
        "openai/gpt-5.5",
        reasoning_effort="xhigh",
        strict=True,
    )
    assert resolved["model"] == "openai/gpt-5.5"
    assert resolved["reasoning_effort"] == "xhigh"


def test_openai_max_effort_is_still_rejected():
    caught: UnsupportedEffortError | None = None
    try:
        _resolve_llm_params(
            "openai/gpt-5.4",
            reasoning_effort="max",
            strict=True,
        )
    except UnsupportedEffortError as exc:
        caught = exc
    assert caught is not None, "Expected UnsupportedEffortError for max effort"
    assert "OpenAI doesn't accept effort='max'" in str(caught)
tests/unit/test_malformed_args_recovery.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Regression test for the malformed-JSON loop in observatory session
2
+ 7750e82f (2026-04-25): GLM-5.1 produced six consecutive ``write`` calls
3
+ whose ``arguments`` strings JSON-parse-failed (truncated mid-stream by
4
+ the provider). The soft retry hint didn't move the model. The detector
5
+ in ``_detect_repeated_malformed`` looks for the streak so the agent loop
6
+ can inject a hard system-prompt forcing a different strategy.
7
+ """
8
+
9
+ from litellm import Message
10
+
11
+ from agent.core.agent_loop import _detect_repeated_malformed
12
+
13
+
14
+ def _malformed_tool_msg(name: str, call_id: str) -> Message:
15
+ return Message(
16
+ role="tool",
17
+ content=(
18
+ f"ERROR: Tool call to '{name}' had malformed JSON arguments and "
19
+ f"was NOT executed. Retry with smaller content — for 'write', "
20
+ f"split into multiple smaller writes using 'edit'."
21
+ ),
22
+ tool_call_id=call_id,
23
+ name=name,
24
+ )
25
+
26
+
27
+ def test_two_consecutive_malformed_same_tool_triggers():
28
+ items = [
29
+ Message(role="user", content="write a big plan"),
30
+ Message(role="assistant", content=None),
31
+ _malformed_tool_msg("write", "1"),
32
+ Message(role="assistant", content=None),
33
+ _malformed_tool_msg("write", "2"),
34
+ ]
35
+ assert _detect_repeated_malformed(items, threshold=2) == "write"
36
+
37
+
38
+ def test_one_malformed_does_not_trigger():
39
+ items = [
40
+ Message(role="user", content="write a plan"),
41
+ Message(role="assistant", content=None),
42
+ _malformed_tool_msg("write", "1"),
43
+ ]
44
+ assert _detect_repeated_malformed(items, threshold=2) is None
45
+
46
+
47
+ def test_two_malformed_different_tools_does_not_trigger():
48
+ items = [
49
+ Message(role="assistant", content=None),
50
+ _malformed_tool_msg("write", "1"),
51
+ Message(role="assistant", content=None),
52
+ _malformed_tool_msg("bash", "2"),
53
+ ]
54
+ assert _detect_repeated_malformed(items, threshold=2) is None
55
+
56
+
57
+ def test_streak_broken_by_successful_tool_call_does_not_trigger():
58
+ items = [
59
+ Message(role="assistant", content=None),
60
+ _malformed_tool_msg("write", "1"),
61
+ Message(role="assistant", content=None),
62
+ Message(role="tool", content="ok", tool_call_id="2", name="write"),
63
+ Message(role="assistant", content=None),
64
+ _malformed_tool_msg("write", "3"),
65
+ ]
66
+ assert _detect_repeated_malformed(items, threshold=2) is None
tests/unit/test_sandbox_already_active_message.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Regression test for sandbox_create not surfacing the hardware lockout.
2
+
3
+ In observatory session d6f8454c (2026-04-25) the agent called
4
+ sandbox_create 18 times across 11 distinct hardware tiers (a10g-large,
5
+ a100-large, t4-small, cpu-upgrade, cpu-basic, zero-a10g, l4x1, t4-medium,
6
+ a10g-small, l40sx1, …). Every call returned 'Sandbox already active' for
7
+ the same sandbox, but the message did not say that hardware can't be
8
+ changed by re-calling, so the agent thought "still pending, retry with a
9
+ different flavor" and burned 17 useless turns.
10
+
11
+ The fix makes the response explicit when the requested hardware differs
12
+ from what's already active.
13
+ """
14
+
15
+ import asyncio
16
+ from types import SimpleNamespace
17
+
18
+ from agent.tools.sandbox_tool import sandbox_create_handler
19
+
20
+
21
+ def _session_with_sandbox():
22
+ sb = SimpleNamespace(
23
+ space_id="user/sandbox-abc123",
24
+ url="https://huggingface.co/spaces/user/sandbox-abc123",
25
+ )
26
+ return SimpleNamespace(sandbox=sb)
27
+
28
+
29
+ def test_already_active_with_different_hw_warns_about_lockout():
30
+ session = _session_with_sandbox()
31
+ out, ok = asyncio.run(
32
+ sandbox_create_handler({"hardware": "a100-large"}, session=session)
33
+ )
34
+ assert ok is True
35
+ # The message should mention the lockout AND the requested flavor.
36
+ assert "cannot be changed" in out.lower()
37
+ assert "a100-large" in out
38
+ assert "delete" in out.lower()
39
+
40
+
41
+ def test_already_active_no_hw_request_just_returns_handle():
42
+ session = _session_with_sandbox()
43
+ out, ok = asyncio.run(sandbox_create_handler({}, session=session))
44
+ assert ok is True
45
+ assert "user/sandbox-abc123" in out
46
+ # No spurious lockout note when the agent didn't request a flavor.
47
+ assert "cannot be changed" not in out.lower()
uv.lock CHANGED
@@ -228,6 +228,18 @@ wheels = [
228
  { url = "https://files.pythonhosted.org/packages/15/b3/9b1a8074496371342ec1e796a96f99c82c945a339cd81a8e73de28b4cf9e/anyio-4.11.0-py3-none-any.whl", hash = "sha256:0287e96f4d26d4149305414d4e3bc32f0dcd0862365a4bddea19d7a1ec38c4fc", size = 109097, upload-time = "2025-09-23T09:19:10.601Z" },
229
  ]
230
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  [[package]]
232
  name = "attrs"
233
  version = "25.4.0"
@@ -992,78 +1004,6 @@ wheels = [
992
  { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },
993
  ]
994
 
995
- [[package]]
996
- name = "hf-agent"
997
- version = "0.1.0"
998
- source = { editable = "." }
999
- dependencies = [
1000
- { name = "boto3" },
1001
- { name = "datasets" },
1002
- { name = "fastapi" },
1003
- { name = "fastmcp" },
1004
- { name = "httpx" },
1005
- { name = "huggingface-hub" },
1006
- { name = "litellm" },
1007
- { name = "nbconvert" },
1008
- { name = "nbformat" },
1009
- { name = "prompt-toolkit" },
1010
- { name = "pydantic" },
1011
- { name = "python-dotenv" },
1012
- { name = "requests" },
1013
- { name = "rich" },
1014
- { name = "thefuzz" },
1015
- { name = "uvicorn", extra = ["standard"] },
1016
- { name = "websockets" },
1017
- { name = "whoosh" },
1018
- ]
1019
-
1020
- [package.optional-dependencies]
1021
- all = [
1022
- { name = "datasets" },
1023
- { name = "inspect-ai" },
1024
- { name = "pandas" },
1025
- { name = "pytest" },
1026
- { name = "tenacity" },
1027
- ]
1028
- dev = [
1029
- { name = "pytest" },
1030
- ]
1031
- eval = [
1032
- { name = "datasets" },
1033
- { name = "inspect-ai" },
1034
- { name = "pandas" },
1035
- { name = "tenacity" },
1036
- ]
1037
-
1038
- [package.metadata]
1039
- requires-dist = [
1040
- { name = "boto3", specifier = ">=1.35.0" },
1041
- { name = "datasets", specifier = ">=4.4.1" },
1042
- { name = "datasets", marker = "extra == 'eval'", specifier = ">=4.3.0" },
1043
- { name = "fastapi", specifier = ">=0.115.0" },
1044
- { name = "fastmcp", specifier = ">=3.2.0" },
1045
- { name = "hf-agent", extras = ["eval", "dev"], marker = "extra == 'all'" },
1046
- { name = "httpx", specifier = ">=0.27.0" },
1047
- { name = "huggingface-hub", specifier = ">=1.0.1" },
1048
- { name = "inspect-ai", marker = "extra == 'eval'", specifier = ">=0.3.149" },
1049
- { name = "litellm", specifier = ">=1.83.0" },
1050
- { name = "nbconvert", specifier = ">=7.16.6" },
1051
- { name = "nbformat", specifier = ">=5.10.4" },
1052
- { name = "pandas", marker = "extra == 'eval'", specifier = ">=2.3.3" },
1053
- { name = "prompt-toolkit", specifier = ">=3.0.0" },
1054
- { name = "pydantic", specifier = ">=2.12.3" },
1055
- { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" },
1056
- { name = "python-dotenv", specifier = ">=1.2.1" },
1057
- { name = "requests", specifier = ">=2.33.0" },
1058
- { name = "rich", specifier = ">=13.0.0" },
1059
- { name = "tenacity", marker = "extra == 'eval'", specifier = ">=8.0.0" },
1060
- { name = "thefuzz", specifier = ">=0.22.1" },
1061
- { name = "uvicorn", extras = ["standard"], specifier = ">=0.32.0" },
1062
- { name = "websockets", specifier = ">=13.0" },
1063
- { name = "whoosh", specifier = ">=2.7.4" },
1064
- ]
1065
- provides-extras = ["eval", "dev", "all"]
1066
-
1067
  [[package]]
1068
  name = "hf-xet"
1069
  version = "1.2.0"
@@ -1827,6 +1767,83 @@ wheels = [
1827
  { url = "https://files.pythonhosted.org/packages/9b/f7/4a5e785ec9fbd65146a27b6b70b6cdc161a66f2024e4b04ac06a67f5578b/mistune-3.2.0-py3-none-any.whl", hash = "sha256:febdc629a3c78616b94393c6580551e0e34cc289987ec6c35ed3f4be42d0eee1", size = 53598, upload-time = "2025-12-23T11:36:33.211Z" },
1828
  ]
1829
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1830
  [[package]]
1831
  name = "mmh3"
1832
  version = "5.2.0"
@@ -2775,6 +2792,19 @@ wheels = [
2775
  { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" },
2776
  ]
2777
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2778
  [[package]]
2779
  name = "python-dateutil"
2780
  version = "2.9.0.post0"
@@ -3619,6 +3649,18 @@ wheels = [
3619
  { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
3620
  ]
3621
 
 
 
 
 
 
 
 
 
 
 
 
 
3622
  [[package]]
3623
  name = "uc-micro-py"
3624
  version = "1.0.3"
 
228
  { url = "https://files.pythonhosted.org/packages/15/b3/9b1a8074496371342ec1e796a96f99c82c945a339cd81a8e73de28b4cf9e/anyio-4.11.0-py3-none-any.whl", hash = "sha256:0287e96f4d26d4149305414d4e3bc32f0dcd0862365a4bddea19d7a1ec38c4fc", size = 109097, upload-time = "2025-09-23T09:19:10.601Z" },
229
  ]
230
 
231
+ [[package]]
232
+ name = "apscheduler"
233
+ version = "3.11.2"
234
+ source = { registry = "https://pypi.org/simple" }
235
+ dependencies = [
236
+ { name = "tzlocal" },
237
+ ]
238
+ sdist = { url = "https://files.pythonhosted.org/packages/07/12/3e4389e5920b4c1763390c6d371162f3784f86f85cd6d6c1bfe68eef14e2/apscheduler-3.11.2.tar.gz", hash = "sha256:2a9966b052ec805f020c8c4c3ae6e6a06e24b1bf19f2e11d91d8cca0473eef41", size = 108683, upload-time = "2025-12-22T00:39:34.884Z" }
239
+ wheels = [
240
+ { url = "https://files.pythonhosted.org/packages/9f/64/2e54428beba8d9992aa478bb8f6de9e4ecaa5f8f513bcfd567ed7fb0262d/apscheduler-3.11.2-py3-none-any.whl", hash = "sha256:ce005177f741409db4e4dd40a7431b76feb856b9dd69d57e0da49d6715bfd26d", size = 64439, upload-time = "2025-12-22T00:39:33.303Z" },
241
+ ]
242
+
243
  [[package]]
244
  name = "attrs"
245
  version = "25.4.0"
 
1004
  { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },
1005
  ]
1006
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1007
  [[package]]
1008
  name = "hf-xet"
1009
  version = "1.2.0"
 
1767
  { url = "https://files.pythonhosted.org/packages/9b/f7/4a5e785ec9fbd65146a27b6b70b6cdc161a66f2024e4b04ac06a67f5578b/mistune-3.2.0-py3-none-any.whl", hash = "sha256:febdc629a3c78616b94393c6580551e0e34cc289987ec6c35ed3f4be42d0eee1", size = 53598, upload-time = "2025-12-23T11:36:33.211Z" },
1768
  ]
1769
 
1770
+ [[package]]
1771
+ name = "ml-intern"
1772
+ version = "0.1.0"
1773
+ source = { editable = "." }
1774
+ dependencies = [
1775
+ { name = "apscheduler" },
1776
+ { name = "boto3" },
1777
+ { name = "datasets" },
1778
+ { name = "fastapi" },
1779
+ { name = "fastmcp" },
1780
+ { name = "httpx" },
1781
+ { name = "huggingface-hub" },
1782
+ { name = "litellm" },
1783
+ { name = "nbconvert" },
1784
+ { name = "nbformat" },
1785
+ { name = "prompt-toolkit" },
1786
+ { name = "pydantic" },
1787
+ { name = "python-dotenv" },
1788
+ { name = "requests" },
1789
+ { name = "rich" },
1790
+ { name = "thefuzz" },
1791
+ { name = "uvicorn", extra = ["standard"] },
1792
+ { name = "websockets" },
1793
+ { name = "whoosh" },
1794
+ ]
1795
+
1796
+ [package.optional-dependencies]
1797
+ all = [
1798
+ { name = "datasets" },
1799
+ { name = "inspect-ai" },
1800
+ { name = "pandas" },
1801
+ { name = "pytest" },
1802
+ { name = "pytest-asyncio" },
1803
+ { name = "tenacity" },
1804
+ ]
1805
+ dev = [
1806
+ { name = "pytest" },
1807
+ { name = "pytest-asyncio" },
1808
+ ]
1809
+ eval = [
1810
+ { name = "datasets" },
1811
+ { name = "inspect-ai" },
1812
+ { name = "pandas" },
1813
+ { name = "tenacity" },
1814
+ ]
1815
+
1816
+ [package.metadata]
1817
+ requires-dist = [
1818
+ { name = "apscheduler", specifier = ">=3.10,<4" },
1819
+ { name = "boto3", specifier = ">=1.35.0" },
1820
+ { name = "datasets", specifier = ">=4.4.1" },
1821
+ { name = "datasets", marker = "extra == 'eval'", specifier = ">=4.3.0" },
1822
+ { name = "fastapi", specifier = ">=0.115.0" },
1823
+ { name = "fastmcp", specifier = ">=3.2.0" },
1824
+ { name = "httpx", specifier = ">=0.27.0" },
1825
+ { name = "huggingface-hub", specifier = ">=1.0.1" },
1826
+ { name = "inspect-ai", marker = "extra == 'eval'", specifier = ">=0.3.149" },
1827
+ { name = "litellm", specifier = ">=1.83.0" },
1828
+ { name = "ml-intern", extras = ["eval", "dev"], marker = "extra == 'all'" },
1829
+ { name = "nbconvert", specifier = ">=7.16.6" },
1830
+ { name = "nbformat", specifier = ">=5.10.4" },
1831
+ { name = "pandas", marker = "extra == 'eval'", specifier = ">=2.3.3" },
1832
+ { name = "prompt-toolkit", specifier = ">=3.0.0" },
1833
+ { name = "pydantic", specifier = ">=2.12.3" },
1834
+ { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.2" },
1835
+ { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.26.0" },
1836
+ { name = "python-dotenv", specifier = ">=1.2.1" },
1837
+ { name = "requests", specifier = ">=2.33.0" },
1838
+ { name = "rich", specifier = ">=13.0.0" },
1839
+ { name = "tenacity", marker = "extra == 'eval'", specifier = ">=8.0.0" },
1840
+ { name = "thefuzz", specifier = ">=0.22.1" },
1841
+ { name = "uvicorn", extras = ["standard"], specifier = ">=0.32.0" },
1842
+ { name = "websockets", specifier = ">=13.0" },
1843
+ { name = "whoosh", specifier = ">=2.7.4" },
1844
+ ]
1845
+ provides-extras = ["eval", "dev", "all"]
1846
+
1847
  [[package]]
1848
  name = "mmh3"
1849
  version = "5.2.0"
 
2792
  { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" },
2793
  ]
2794
 
2795
+ [[package]]
2796
+ name = "pytest-asyncio"
2797
+ version = "1.3.0"
2798
+ source = { registry = "https://pypi.org/simple" }
2799
+ dependencies = [
2800
+ { name = "pytest" },
2801
+ { name = "typing-extensions", marker = "python_full_version < '3.13'" },
2802
+ ]
2803
+ sdist = { url = "https://files.pythonhosted.org/packages/90/2c/8af215c0f776415f3590cac4f9086ccefd6fd463befeae41cd4d3f193e5a/pytest_asyncio-1.3.0.tar.gz", hash = "sha256:d7f52f36d231b80ee124cd216ffb19369aa168fc10095013c6b014a34d3ee9e5", size = 50087, upload-time = "2025-11-10T16:07:47.256Z" }
2804
+ wheels = [
2805
+ { url = "https://files.pythonhosted.org/packages/e5/35/f8b19922b6a25bc0880171a2f1a003eaeb93657475193ab516fd87cac9da/pytest_asyncio-1.3.0-py3-none-any.whl", hash = "sha256:611e26147c7f77640e6d0a92a38ed17c3e9848063698d5c93d5aa7aa11cebff5", size = 15075, upload-time = "2025-11-10T16:07:45.537Z" },
2806
+ ]
2807
+
2808
  [[package]]
2809
  name = "python-dateutil"
2810
  version = "2.9.0.post0"
 
3649
  { url = "https://files.pythonhosted.org/packages/5c/23/c7abc0ca0a1526a0774eca151daeb8de62ec457e77262b66b359c3c7679e/tzdata-2025.2-py2.py3-none-any.whl", hash = "sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8", size = 347839, upload-time = "2025-03-23T13:54:41.845Z" },
3650
  ]
3651
 
3652
+ [[package]]
3653
+ name = "tzlocal"
3654
+ version = "5.3.1"
3655
+ source = { registry = "https://pypi.org/simple" }
3656
+ dependencies = [
3657
+ { name = "tzdata", marker = "sys_platform == 'win32'" },
3658
+ ]
3659
+ sdist = { url = "https://files.pythonhosted.org/packages/8b/2e/c14812d3d4d9cd1773c6be938f89e5735a1f11a9f184ac3639b93cef35d5/tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd", size = 30761, upload-time = "2025-03-05T21:17:41.549Z" }
3660
+ wheels = [
3661
+ { url = "https://files.pythonhosted.org/packages/c2/14/e2a54fabd4f08cd7af1c07030603c3356b74da07f7cc056e600436edfa17/tzlocal-5.3.1-py3-none-any.whl", hash = "sha256:eb1a66c3ef5847adf7a834f1be0800581b683b5608e74f86ecbcef8ab91bb85d", size = 18026, upload-time = "2025-03-05T21:17:39.857Z" },
3662
+ ]
3663
+
3664
  [[package]]
3665
  name = "uc-micro-py"
3666
  version = "1.0.3"