Spaces:

umanggarg
/

cartographer

Sleeping

umanggarg Claude Sonnet 4.6 committed on Apr 4

Commit

c0f8586

1 Parent(s): 1f180ad

Add model selector: dropdown UI + /agent/models endpoint

- GET /agent/models returns catalog of available models with speed/note metadata
- POST /agent/stream now accepts model_id to override the default priority chain
- AgentService.stream() temporarily swaps client/provider/model per-request,
restoring the default in a finally block so the priority chain is preserved
- Fixed stream() indentation bug: for loop body was outside the try block
- Frontend: model selector dropdown in agent mode footer (like Claude's UI);
  it shows the active model name + speed badge, and the dropdown lists all models with notes
- selectedModelId persisted to localStorage across page loads

Tested: /agent/models returns correct JSON; /agent/stream with model_id routes
to Cerebras Qwen3-235B and reports correct model in done event.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (6) hide show

backend/main.py +35 -2
backend/models/schemas.py +5 -0
backend/services/agent.py +193 -124
ui/src/App.jsx +85 -3
ui/src/api.js +9 -2
ui/src/index.css +145 -0

backend/main.py CHANGED Viewed

@@ -57,7 +57,7 @@ from backend.models.schemas import (
 from backend.config import settings
 from backend.services.ingestion_service import IngestionService
 from backend.services.generation import GenerationService, classify_query
-from backend.services.agent import AgentService
 from backend.services.diagram_service import DiagramService
 from backend.services.repo_map_service import RepoMapService
 from backend.mcp_server import mcp, init_services as init_mcp_services
@@ -801,10 +801,42 @@ class AgentStreamRequest(BaseModel):
     """Request body for POST /agent/stream — agentic RAG with conversation history."""
     question: str
     repo:     str | None = None
     # Conversation history: prior [{role, content}] turns for follow-up questions.
     history:  list[dict] = []
 @app.post("/agent/stream", tags=["agent"])
 async def agent_stream(request: AgentStreamRequest):
     """
@@ -825,6 +857,7 @@ async def agent_stream(request: AgentStreamRequest):
     svc      = _agent_service  # may be None if no API key configured
     question = request.question
     repo     = request.repo
     history  = request.history[-10:]  # cap at 5 exchanges
     async def event_stream():
@@ -851,7 +884,7 @@ async def agent_stream(request: AgentStreamRequest):
             async def _producer():
                 try:
-                    async for event in svc.stream(question, repo_filter=repo, history=history):
                         await queue.put(("event", event))
                     await queue.put(("done", None))
                 except Exception as exc:

 from backend.config import settings
 from backend.services.ingestion_service import IngestionService
 from backend.services.generation import GenerationService, classify_query
+from backend.services.agent import AgentService, AGENT_MODELS
 from backend.services.diagram_service import DiagramService
 from backend.services.repo_map_service import RepoMapService
 from backend.mcp_server import mcp, init_services as init_mcp_services
     """Request body for POST /agent/stream — agentic RAG with conversation history."""
     question: str
     repo:     str | None = None
+    model_id: str | None = None   # catalog ID from /agent/models; None = auto priority chain
     # Conversation history: prior [{role, content}] turns for follow-up questions.
     history:  list[dict] = []
+@app.get("/agent/models", tags=["agent"])
+async def agent_models():
+    """
+    Return the list of available agent models with metadata for the model selector UI.
+    Each entry has:
+      id:          unique catalog ID sent back as model_id in /agent/stream requests
+      name:        display name shown in the UI
+      provider:    which API this model is served by
+      speed:       "fast" | "slow" — used to show a visual indicator
+      speed_label: human-readable time estimate (e.g. "~40s")
+      note:        one-sentence tradeoff description shown in the tooltip / expanded row
+      available:   whether the required API key is configured on this server
+    """
+    from backend.config import settings
+    result = []
+    for m in AGENT_MODELS:
+        key_attr = m.get("requires", "")
+        available = bool(getattr(settings, key_attr, ""))
+        result.append({
+            "id":          m["id"],
+            "name":        m["name"],
+            "provider":    m["provider"],
+            "speed":       m["speed"],
+            "speed_label": m["speed_label"],
+            "note":        m["note"],
+            "available":   available,
+        })
+    return {"models": result}
 @app.post("/agent/stream", tags=["agent"])
 async def agent_stream(request: AgentStreamRequest):
     """
     svc      = _agent_service  # may be None if no API key configured
     question = request.question
     repo     = request.repo
+    model_id = request.model_id
     history  = request.history[-10:]  # cap at 5 exchanges
     async def event_stream():
             async def _producer():
                 try:
+                    async for event in svc.stream(question, repo_filter=repo, history=history, model_id=model_id):
                         await queue.put(("event", event))
                     await queue.put(("done", None))
                 except Exception as exc:

backend/models/schemas.py CHANGED Viewed

@@ -172,6 +172,11 @@ class AgentRequest(BaseModel):
         default=None,
         description="Restrict search to a specific repo slug (e.g. 'karpathy/micrograd')",
     )
 class AgentResponse(BaseModel):

         default=None,
         description="Restrict search to a specific repo slug (e.g. 'karpathy/micrograd')",
     )
+    model_id: Optional[str] = Field(
+        default=None,
+        description="Model catalog ID to use (e.g. 'cerebras/qwen3-235b'). Defaults to the server's priority chain.",
+    )
+    history: list = Field(default_factory=list, description="Prior conversation turns")
 class AgentResponse(BaseModel):

backend/services/agent.py CHANGED Viewed

@@ -297,6 +297,54 @@ def _sources_from_search_result(result_text: str, fallback_repo: str | None) ->
 # OpenRouter: free model with confirmed tool-calling support.
 # Required headers: HTTP-Referer (for attribution) and X-Title (app name).
 # Without HTTP-Referer, free tier access may be denied.
 _OPENROUTER_MODEL = "qwen/qwen3-coder:free"
 # Groq models tried in order when the primary is over capacity or decommissioned.
@@ -472,8 +520,8 @@ class AgentService:
                 base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
             )
             self._provider = "gemini"
-            self._model    = "gemini-2.0-flash"
-            print("AgentService: using Google Gemini (gemini-2.0-flash) via MCP tools")
         elif settings.openrouter_api_key:
             self._client   = _openrouter_client(settings.openrouter_api_key)
             self._provider = "openrouter"
@@ -557,7 +605,11 @@ class AgentService:
         }
     async def stream(
-        self, question: str, repo_filter: str | None = None, history: list[dict] | None = None
     ) -> AsyncIterator[dict]:
         """
         Stream agent progress as an async generator.
@@ -581,129 +633,146 @@ class AgentService:
           we re-run with stream=True so tokens arrive in real time.
           This is one extra LLM call but delivers genuine streaming UX.
         """
-        # Discover tools from MCP server (cached after first call)
-        mcp_tools = await self.mcp.list_tools()
-        messages  = self._build_initial_messages(question, repo_filter, history)
-        # Clear session notes from any previous run so this conversation starts fresh.
-        # Note: we import here to avoid circular imports at module load time.
-        from backend.mcp_server import clear_notes
-        clear_notes()
-        # Loop detection: skip duplicate tool calls in the stream path too.
-        seen_calls: set[tuple] = set()
-        # Collect source references from tool calls for the sources panel.
-        # Keyed by (repo, filepath, start_line) to deduplicate across iterations.
-        collected_sources: dict[tuple, dict] = {}
-        for iteration in range(self.MAX_ITERATIONS):
-            # Run sync LLM call in thread pool — doesn't block the event loop
-            # Pass raw mcp_tools so _call_llm can reformat if provider switches mid-run
-            step = await asyncio.to_thread(self._call_llm, messages, mcp_tools)
-            if step["done"]:
-                # Stream the final answer with real token-by-token delivery.
-                # We pass messages (with all tool results) to the streaming call
-                # and tell the LLM not to use tools (tool_choice="none") so it
-                # goes straight to answering.
-                async for token in self._stream_final_answer(messages, mcp_tools):
-                    yield {"type": "token", "text": token}
-                # Emit sources collected across all tool calls before done event
-                if collected_sources:
-                    yield {"type": "sources", "sources": list(collected_sources.values())}
-                yield {"type": "done", "iterations": iteration + 1, "model": self._model}
-                return
-            messages.append(step["assistant_message"])
-            # Emit any pre-tool reasoning text the LLM produced before calling tools.
-            # This lets the UI show "thought bubbles" in the trace timeline —
-            # the user sees WHY each tool was chosen, not just WHAT was called.
-            thought = _extract_thought(step["assistant_message"], self._provider)
-            if thought:
-                yield {"type": "thought", "text": thought}
-            # ── Parallel tool execution ───────────────────────────────────────
-            # The LLM may return multiple tool calls in one turn (e.g. search_code
-            # called 2-3 times for different query angles simultaneously).
-            # Instead of serial execution, we:
-            #   1. Emit tool_call events for all new (non-duplicate) calls upfront
-            #   2. Run them concurrently with asyncio.gather
-            #   3. Emit tool_result events for all after they complete
-            #
-            # This reduces latency proportionally to the number of parallel calls
-            # (3 serial 500ms searches → 1 parallel 500ms round trip).
-            # Separate new calls from duplicates
-            new_calls: list[dict] = []
-            for tc in step["tool_calls"]:
-                call_key = (tc["name"], tuple(sorted(tc["input"].items())))
-                if call_key in seen_calls:
-                    dup_msg = f"[Skipped duplicate {tc['name']} call — already ran with these arguments]"
-                    yield {"type": "tool_result", "tool": tc["name"], "output": dup_msg}
-                    messages.append(self._build_tool_result(tc["id"], tc["name"], dup_msg))
-                else:
-                    seen_calls.add(call_key)
-                    new_calls.append(tc)
-                    # Emit tool_call events immediately so UI shows them in parallel
-                    yield {"type": "tool_call", "tool": tc["name"], "input": tc["input"]}
-            if not new_calls:
-                continue
-            # Execute all new calls concurrently — MCP calls are async HTTP round trips
-            async def _run_tool(tc: dict) -> str:
-                try:
-                    return await self.mcp.call_tool(tc["name"], tc["input"])
-                except Exception as e:
-                    return f"Tool error: {e}"
-            parallel_results = await asyncio.gather(*[_run_tool(tc) for tc in new_calls])
-            # Process results in the same order as the calls
-            for tc, result in zip(new_calls, parallel_results):
-                # Collect source metadata for the sources panel
-                if tc["name"] == "get_file_chunk":
-                    src = _source_from_chunk_call(tc["input"], result)
-                    if src:
-                        key = (src["repo"], src["filepath"], src["start_line"])
-                        collected_sources[key] = src
-                if tc["name"] in ("search_code", "find_callers", "search_symbol") and not result.startswith("No results"):
-                    for src in _sources_from_search_result(result, tc["input"].get("repo") or repo_filter):
-                        key = (src["repo"], src["filepath"], src["start_line"])
-                        collected_sources[key] = src
-                # read_file returns a whole file — record it as a single source entry
-                if tc["name"] == "read_file" and tc["input"].get("filepath"):
-                    repo     = tc["input"].get("repo", repo_filter or "")
-                    filepath = tc["input"]["filepath"]
-                    key = (repo, filepath, 0)
-                    if key not in collected_sources:
-                        ext = "." + filepath.rsplit(".", 1)[-1].lower() if "." in filepath else ""
-                        lang = {"py": "python", "js": "javascript", "ts": "typescript",
-                                "go": "go", "rs": "rust", "java": "java"}.get(ext.lstrip("."), "text")
-                        collected_sources[key] = {
-                            "repo": repo, "filepath": filepath, "language": lang,
-                            "chunk_type": "file", "name": filepath.rsplit("/", 1)[-1],
-                            "start_line": 1, "end_line": result.count("\n"),
-                            "score": 1.0, "text": "",
-                        }
-                display = result[:500] + "…" if len(result) > 500 else result
-                yield {"type": "tool_result", "tool": tc["name"], "output": display}
-                messages.append(self._build_tool_result(tc["id"], tc["name"], result))
-        # MAX_ITERATIONS hit — LLM never voluntarily stopped, but it has gathered
-        # context from all its tool calls. Force a final answer from that context
-        # rather than returning silence.
-        async for token in self._stream_final_answer(messages, mcp_tools):
-            yield {"type": "token", "text": token}
-        # Emit any collected sources even when we hit the iteration cap
-        if collected_sources:
-            yield {"type": "sources", "sources": list(collected_sources.values())}
-        yield {"type": "done", "iterations": self.MAX_ITERATIONS, "model": self._model}
     async def _stream_final_answer(self, messages: list, mcp_tools: list) -> AsyncIterator[str]:
         """
@@ -813,8 +882,8 @@ class AgentService:
                 base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
             )
             self._provider = "gemini"
-            self._model    = "gemini-2.0-flash"
-            print("AgentService: Cerebras limit hit — switched to Gemini (gemini-2.0-flash)")
             return True
         if self._provider in ("cerebras", "gemini") and settings.openrouter_api_key:
             self._client   = _openrouter_client(settings.openrouter_api_key)

 # OpenRouter: free model with confirmed tool-calling support.
 # Required headers: HTTP-Referer (for attribution) and X-Title (app name).
 # Without HTTP-Referer, free tier access may be denied.
+# ── Model catalog ─────────────────────────────────────────────────────────────
+# Each entry describes a model the user can select from the UI.
+#   "id"        catalog ID; stream() compares it with the request's model_id.
+#   "provider"  picks the client in _make_client() and becomes self._provider
+#               while an override is active. It must match the strings used in
+#               _call_groq / _call_anthropic routing.
+#   "model"     model name assigned to self._model while an override is active.
+#   "requires"  name of the settings attribute that holds the API key. The
+#               /agent/models endpoint reports the entry as "available" only
+#               when that attribute is non-empty; it does not drop the entry.
+#   "name", "speed", "speed_label", "note" are display metadata returned
+#   unchanged by /agent/models.
+AGENT_MODELS: list[dict] = [
+    {
+        "id":          "cerebras/qwen3-235b",
+        "name":        "Qwen3 235B",
+        "provider":    "cerebras",
+        "model":       "qwen-3-235b-a22b-instruct-2507",
+        "requires":    "cerebras_api_key",
+        "speed":       "fast",
+        "speed_label": "~40s",
+        "note":        "Best balance. Fast inference (1400 tok/s), strong tool use, generous free quota.",
+    },
+    {
+        "id":          "google/gemma4-31b",
+        "name":        "Gemma 4 31B",
+        "provider":    "gemini",
+        "model":       "gemma-4-31b-it",
+        "requires":    "gemini_api_key",
+        "speed":       "slow",
+        "speed_label": "~90s",
+        "note":        "Highest quality. Reads actual source files. Slower but thorough. Free via AI Studio.",
+    },
+    {
+        "id":          "google/gemini-flash",
+        "name":        "Gemini 2.0 Flash",
+        "provider":    "gemini",
+        "model":       "gemini-2.0-flash",
+        "requires":    "gemini_api_key",
+        "speed":       "fast",
+        "speed_label": "~15s",
+        "note":        "Fastest. Lower quality than Gemma 4. 1500 req/day free limit.",
+    },
+]
+def _make_client(model_entry: dict):
+    """Instantiate the right API client for a model catalog entry."""
+    from openai import OpenAI
+    if model_entry["provider"] == "cerebras":
+        return OpenAI(api_key=settings.cerebras_api_key, base_url="https://api.cerebras.ai/v1")
+    else:  # gemini
+        return OpenAI(
+            api_key=settings.gemini_api_key,
+            base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
+        )
 _OPENROUTER_MODEL = "qwen/qwen3-coder:free"
 # Groq models tried in order when the primary is over capacity or decommissioned.
                 base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
             )
             self._provider = "gemini"
+            self._model    = "gemma-4-31b-it"
+            print("AgentService: using Gemma 4 31B (gemma-4-31b-it) via MCP tools")
         elif settings.openrouter_api_key:
             self._client   = _openrouter_client(settings.openrouter_api_key)
             self._provider = "openrouter"
         }
     async def stream(
+        self,
+        question: str,
+        repo_filter: str | None = None,
+        history: list[dict] | None = None,
+        model_id: str | None = None,
     ) -> AsyncIterator[dict]:
         """
         Stream agent progress as an async generator.
           we re-run with stream=True so tokens arrive in real time.
           This is one extra LLM call but delivers genuine streaming UX.
         """
+        # ── Per-request model override ────────────────────────────────────────
+        # If the user selected a specific model in the UI, temporarily swap to it.
+        # We save/restore self._client/provider/model in a finally block so the
+        # default priority chain is preserved for the next request.
+        _orig = (self._client, self._provider, self._model)
+        entry = next((m for m in AGENT_MODELS if m["id"] == model_id), None)
+        if entry:
+            self._client   = _make_client(entry)
+            self._provider = entry["provider"]
+            self._model    = entry["model"]
+        try:
+            # Discover tools from MCP server (cached after first call)
+            mcp_tools = await self.mcp.list_tools()
+            messages  = self._build_initial_messages(question, repo_filter, history)
+            # Clear session notes from any previous run so this conversation starts fresh.
+            # Note: we import here to avoid circular imports at module load time.
+            from backend.mcp_server import clear_notes
+            clear_notes()
+            # Loop detection: skip duplicate tool calls in the stream path too.
+            seen_calls: set[tuple] = set()
+            # Collect source references from tool calls for the sources panel.
+            # Keyed by (repo, filepath, start_line) to deduplicate across iterations.
+            collected_sources: dict[tuple, dict] = {}
+            for iteration in range(self.MAX_ITERATIONS):
+                # Run sync LLM call in thread pool — doesn't block the event loop
+                # Pass raw mcp_tools so _call_llm can reformat if provider switches mid-run
+                step = await asyncio.to_thread(self._call_llm, messages, mcp_tools)
+                if step["done"]:
+                    # Stream the final answer with real token-by-token delivery.
+                    # We pass messages (with all tool results) to the streaming call
+                    # and tell the LLM not to use tools (tool_choice="none") so it
+                    # goes straight to answering.
+                    async for token in self._stream_final_answer(messages, mcp_tools):
+                        yield {"type": "token", "text": token}
+                    # Emit sources collected across all tool calls before done event
+                    if collected_sources:
+                        yield {"type": "sources", "sources": list(collected_sources.values())}
+                    yield {"type": "done", "iterations": iteration + 1, "model": self._model}
+                    return
+                messages.append(step["assistant_message"])
+                # Emit any pre-tool reasoning text the LLM produced before calling tools.
+                # This lets the UI show "thought bubbles" in the trace timeline —
+                # the user sees WHY each tool was chosen, not just WHAT was called.
+                thought = _extract_thought(step["assistant_message"], self._provider)
+                if thought:
+                    yield {"type": "thought", "text": thought}
+                # ── Parallel tool execution ───────────────────────────────────────
+                # The LLM may return multiple tool calls in one turn (e.g. search_code
+                # called 2-3 times for different query angles simultaneously).
+                # Instead of serial execution, we:
+                #   1. Emit tool_call events for all new (non-duplicate) calls upfront
+                #   2. Run them concurrently with asyncio.gather
+                #   3. Emit tool_result events for all after they complete
+                #
+                # This reduces latency proportionally to the number of parallel calls
+                # (3 serial 500ms searches → 1 parallel 500ms round trip).
+                # Separate new calls from duplicates
+                new_calls: list[dict] = []
+                for tc in step["tool_calls"]:
+                    call_key = (tc["name"], tuple(sorted(tc["input"].items())))
+                    if call_key in seen_calls:
+                        dup_msg = f"[Skipped duplicate {tc['name']} call — already ran with these arguments]"
+                        yield {"type": "tool_result", "tool": tc["name"], "output": dup_msg}
+                        messages.append(self._build_tool_result(tc["id"], tc["name"], dup_msg))
+                    else:
+                        seen_calls.add(call_key)
+                        new_calls.append(tc)
+                        # Emit tool_call events immediately so UI shows them in parallel
+                        yield {"type": "tool_call", "tool": tc["name"], "input": tc["input"]}
+                if not new_calls:
+                    continue
+                # Execute all new calls concurrently — MCP calls are async HTTP round trips
+                async def _run_tool(tc: dict) -> str:
+                    try:
+                        return await self.mcp.call_tool(tc["name"], tc["input"])
+                    except Exception as e:
+                        return f"Tool error: {e}"
+                parallel_results = await asyncio.gather(*[_run_tool(tc) for tc in new_calls])
+                # Process results in the same order as the calls
+                for tc, result in zip(new_calls, parallel_results):
+                    # Collect source metadata for the sources panel
+                    if tc["name"] == "get_file_chunk":
+                        src = _source_from_chunk_call(tc["input"], result)
+                        if src:
+                            key = (src["repo"], src["filepath"], src["start_line"])
+                            collected_sources[key] = src
+                    if tc["name"] in ("search_code", "find_callers", "search_symbol") and not result.startswith("No results"):
+                        for src in _sources_from_search_result(result, tc["input"].get("repo") or repo_filter):
+                            key = (src["repo"], src["filepath"], src["start_line"])
+                            collected_sources[key] = src
+                    # read_file returns a whole file — record it as a single source entry
+                    if tc["name"] == "read_file" and tc["input"].get("filepath"):
+                        repo     = tc["input"].get("repo", repo_filter or "")
+                        filepath = tc["input"]["filepath"]
+                        key = (repo, filepath, 0)
+                        if key not in collected_sources:
+                            ext = "." + filepath.rsplit(".", 1)[-1].lower() if "." in filepath else ""
+                            lang = {"py": "python", "js": "javascript", "ts": "typescript",
+                                    "go": "go", "rs": "rust", "java": "java"}.get(ext.lstrip("."), "text")
+                            collected_sources[key] = {
+                                "repo": repo, "filepath": filepath, "language": lang,
+                                "chunk_type": "file", "name": filepath.rsplit("/", 1)[-1],
+                                "start_line": 1, "end_line": result.count("\n"),
+                                "score": 1.0, "text": "",
+                            }
+                    display = result[:500] + "…" if len(result) > 500 else result
+                    yield {"type": "tool_result", "tool": tc["name"], "output": display}
+                    messages.append(self._build_tool_result(tc["id"], tc["name"], result))
+            # MAX_ITERATIONS hit — LLM never voluntarily stopped, but it has gathered
+            # context from all its tool calls. Force a final answer from that context
+            # rather than returning silence.
+            async for token in self._stream_final_answer(messages, mcp_tools):
+                yield {"type": "token", "text": token}
+            # Emit any collected sources even when we hit the iteration cap
+            if collected_sources:
+                yield {"type": "sources", "sources": list(collected_sources.values())}
+            yield {"type": "done", "iterations": self.MAX_ITERATIONS, "model": self._model}
+        finally:
+            # Restore original client/provider/model so the next request uses the
+            # default priority chain regardless of what model was selected this time.
+            self._client, self._provider, self._model = _orig
     async def _stream_final_answer(self, messages: list, mcp_tools: list) -> AsyncIterator[str]:
         """
                 base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
             )
             self._provider = "gemini"
+            self._model    = "gemma-4-31b-it"
+            print("AgentService: Cerebras limit hit — switched to Gemma 4 31B (gemma-4-31b-it)")
             return True
         if self._provider in ("cerebras", "gemini") and settings.openrouter_api_key:
             self._client   = _openrouter_client(settings.openrouter_api_key)

ui/src/App.jsx CHANGED Viewed

@@ -2,7 +2,7 @@ import { useState, useEffect, useRef, useCallback } from "react";
 import Sidebar from "./components/Sidebar";
 import Message from "./components/Message";
 import DiagramView from "./components/DiagramView";
-import { fetchRepos, streamQuery, streamAgentQuery, fetchMcpStatus, fetchMcpPrompt } from "./api";
 export default function App() {
   const [repos, setRepos]           = useState([]);
@@ -31,6 +31,14 @@ export default function App() {
   const [promptMenu, setPromptMenu]   = useState(false);   // dropdown visible
   const [promptFilter, setPromptFilter] = useState("");    // text after "/"
   const bottomRef           = useRef(null);
   const scrollRef           = useRef(null);
   const latestAssistantRef  = useRef(null); // top of the current streaming assistant message
@@ -83,6 +91,33 @@ export default function App() {
   useEffect(() => { streamingRef.current = streaming; }, [streaming]);
   // Persist agent mode preference across page loads
   useEffect(() => { localStorage.setItem('ghrc_agentMode', agentMode); }, [agentMode]);
   // Keep handleSubmitRef pointing at the latest handleSubmit (avoids stale closures
   // in the rate-limit countdown which captures this ref via closure).
   // We update it on every render so it always has the current state in scope.
@@ -390,6 +425,7 @@ export default function App() {
       stop = streamAgentQuery({
         question,
         repo: activeRepo,
         history,
         onThought: (text) => {
           // Append a thought entry to the trace — rendered as a reasoning bubble
@@ -846,9 +882,55 @@ export default function App() {
                   <div className="input-hint" aria-hidden="true">{isMac ? "⌘K" : "Ctrl+K"}</div>
                 )}
               </div>
-              {/* Agent mode indicator — small label below the textarea row */}
               {agentMode && (
-                <div className="input-mode-badge" aria-hidden="true" title="Agent mode — runs the ReAct loop (Reason + Act): searches the codebase, reads the result, decides if it needs more context, then searches again. The same pattern production agents use.">✦ Agent</div>
               )}
             </div>
           </>

 import Sidebar from "./components/Sidebar";
 import Message from "./components/Message";
 import DiagramView from "./components/DiagramView";
+import { fetchRepos, streamQuery, streamAgentQuery, fetchMcpStatus, fetchMcpPrompt, fetchAgentModels } from "./api";
 export default function App() {
   const [repos, setRepos]           = useState([]);
   const [promptMenu, setPromptMenu]   = useState(false);   // dropdown visible
   const [promptFilter, setPromptFilter] = useState("");    // text after "/"
+  // Model selector: available models fetched from /agent/models
+  const [agentModels, setAgentModels] = useState([]);
+  const [selectedModelId, setSelectedModelId] = useState(
+    () => localStorage.getItem('ghrc_selectedModel') || null
+  );
+  const [modelMenuOpen, setModelMenuOpen] = useState(false);
+  const modelMenuRef = useRef(null);
   const bottomRef           = useRef(null);
   const scrollRef           = useRef(null);
   const latestAssistantRef  = useRef(null); // top of the current streaming assistant message
   useEffect(() => { streamingRef.current = streaming; }, [streaming]);
   // Persist agent mode preference across page loads
   useEffect(() => { localStorage.setItem('ghrc_agentMode', agentMode); }, [agentMode]);
+  // Persist selected model
+  useEffect(() => {
+    if (selectedModelId) localStorage.setItem('ghrc_selectedModel', selectedModelId);
+    else localStorage.removeItem('ghrc_selectedModel');
+  }, [selectedModelId]);
+  // Fetch available agent models once on mount
+  useEffect(() => {
+    fetchAgentModels().then(models => {
+      setAgentModels(models);
+      // Keep the stored selection if it is still in the catalog; otherwise default to the first available model
+      setSelectedModelId(prev => {
+        if (prev && models.some(m => m.id === prev)) return prev;
+        const first = models.find(m => m.available);
+        return first ? first.id : null;
+      });
+    });
+  }, []);
+  // Close model menu when clicking outside
+  useEffect(() => {
+    function onClickOutside(e) {
+      if (modelMenuRef.current && !modelMenuRef.current.contains(e.target)) {
+        setModelMenuOpen(false);
+      }
+    }
+    document.addEventListener("mousedown", onClickOutside);
+    return () => document.removeEventListener("mousedown", onClickOutside);
+  }, []);
   // Keep handleSubmitRef pointing at the latest handleSubmit (avoids stale closures
   // in the rate-limit countdown which captures this ref via closure).
   // We update it on every render so it always has the current state in scope.
       stop = streamAgentQuery({
         question,
         repo: activeRepo,
+        model_id: selectedModelId || undefined,
         history,
         onThought: (text) => {
           // Append a thought entry to the trace — rendered as a reasoning bubble
                   <div className="input-hint" aria-hidden="true">{isMac ? "⌘K" : "Ctrl+K"}</div>
                 )}
               </div>
+              {/* Agent mode footer: badge + model selector */}
               {agentMode && (
+                <div className="input-footer-row">
+                  <div className="input-mode-badge" title="Agent mode — runs the ReAct loop (Reason + Act): searches the codebase, reads the result, decides if it needs more context, then searches again. The same pattern production agents use.">✦ Agent</div>
+                  {agentModels.length > 0 && (() => {
+                    const active = agentModels.find(m => m.id === selectedModelId) || agentModels.find(m => m.available) || agentModels[0];
+                    return (
+                      <div className="model-selector" ref={modelMenuRef}>
+                        <button
+                          className="model-selector-btn"
+                          onClick={() => setModelMenuOpen(o => !o)}
+                          title={active?.note}
+                        >
+                          <span className="model-selector-name">{active?.name ?? "Auto"}</span>
+                          {active && <span className={`model-speed-badge model-speed-${active.speed}`}>{active.speed_label}</span>}
+                          {/* chevron */}
+                          <svg className={`model-chevron${modelMenuOpen ? " open" : ""}`} width="10" height="10" viewBox="0 0 10 10" fill="none">
+                            <path d="M2 3.5L5 6.5L8 3.5" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round"/>
+                          </svg>
+                        </button>
+                        {modelMenuOpen && (
+                          <div className="model-menu">
+                            {agentModels.map(m => (
+                              <button
+                                key={m.id}
+                                className={`model-menu-item${m.id === selectedModelId ? " active" : ""}${!m.available ? " unavailable" : ""}`}
+                                onClick={() => { setSelectedModelId(m.id); setModelMenuOpen(false); }}
+                                disabled={!m.available}
+                                title={!m.available ? `Requires ${m.provider} API key` : undefined}
+                              >
+                                <div className="model-menu-row">
+                                  <span className="model-menu-name">{m.name}</span>
+                                  <span className={`model-speed-badge model-speed-${m.speed}`}>{m.speed_label}</span>
+                                  {m.id === selectedModelId && (
+                                    <svg width="12" height="12" viewBox="0 0 12 12" fill="none" style={{marginLeft:"auto",flexShrink:0}}>
+                                      <path d="M2 6l3 3 5-5" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round"/>
+                                    </svg>
+                                  )}
+                                </div>
+                                <div className="model-menu-note">{m.note}</div>
+                                {!m.available && <div className="model-menu-unavail">API key not configured</div>}
+                              </button>
+                            ))}
+                          </div>
+                        )}
+                      </div>
+                    );
+                  })()}
+                </div>
               )}
             </div>
           </>

ui/src/api.js CHANGED Viewed

@@ -2,6 +2,13 @@
 // In production:  set VITE_API_URL in Vercel environment variables
 const BASE = import.meta.env.VITE_API_URL || "http://localhost:8000";
 export async function fetchRepos() {
   const res = await fetch(`${BASE}/repos`);
   if (!res.ok) throw new Error("Failed to fetch repos");
@@ -312,13 +319,13 @@ export function streamQuery({ question, repo, mode, history, onToken, onSources,
  *   onDone(iterations)         — agent finished
  *   onError(msg)               — connection or server error
  */
-export function streamAgentQuery({ question, repo, history, onThought, onToolCall, onToolResult, onToken, onSources, onDone, onError }) {
   const controller = new AbortController();
   fetch(`${BASE}/agent/stream`, {
     method:  "POST",
     headers: { "Content-Type": "application/json" },
-    body:    JSON.stringify({ question, repo: repo || null, history: history || [] }),
     signal:  controller.signal,
   }).then(async (res) => {
     if (!res.ok) { onError?.(`Server error ${res.status}`); return; }

 // In production:  set VITE_API_URL in Vercel environment variables
 const BASE = import.meta.env.VITE_API_URL || "http://localhost:8000";
+export async function fetchAgentModels() {
+  const res = await fetch(`${BASE}/agent/models`);
+  if (!res.ok) return [];
+  const data = await res.json();
+  return data.models || [];
+}
 export async function fetchRepos() {
   const res = await fetch(`${BASE}/repos`);
   if (!res.ok) throw new Error("Failed to fetch repos");
  *   onDone(iterations)         — agent finished
  *   onError(msg)               — connection or server error
  */
+export function streamAgentQuery({ question, repo, model_id, history, onThought, onToolCall, onToolResult, onToken, onSources, onDone, onError }) {
   const controller = new AbortController();
   fetch(`${BASE}/agent/stream`, {
     method:  "POST",
     headers: { "Content-Type": "application/json" },
+    body:    JSON.stringify({ question, repo: repo || null, model_id: model_id || null, history: history || [] }),
     signal:  controller.signal,
   }).then(async (res) => {
     if (!res.ok) { onError?.(`Server error ${res.status}`); return; }

ui/src/index.css CHANGED Viewed

@@ -1566,6 +1566,151 @@ textarea:focus-visible {
   gap: 4px;
 }
 /* ══════════════════════════════════════════════════════════
    STATUS BAR
    ══════════════════════════════════════════════════════════ */

   gap: 4px;
 }
+/* ══════════════════════════════════════════════════════════
+   INPUT FOOTER ROW — wraps agent badge + model selector
+   ══════════════════════════════════════════════════════════ */
+/* Flex row that lays out the agent badge and the model selector side by side. */
+.input-footer-row {
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  margin-top: 5px;
+}
+/* Remove the margin-top from the badge when it's inside this row; the row's own margin-top supplies the spacing. */
+.input-footer-row .input-mode-badge {
+  margin-top: 0;
+}
+/* ══════════════════════════════════════════════════════════
+   MODEL SELECTOR — dropdown button + floating menu
+   ══════════════════════════════════════════════════════════ */
+.model-selector {
+  position: relative;
+}
+.model-selector-btn {
+  display: inline-flex;
+  align-items: center;
+  gap: 5px;
+  background: none;
+  border: 1px solid rgba(237,228,206,0.10);
+  border-radius: var(--radius-sm);
+  padding: 2px 7px 2px 6px;
+  cursor: pointer;
+  font-family: var(--sans);
+  font-size: 10px;
+  font-weight: 500;
+  color: var(--muted);
+  transition: color var(--transition), border-color var(--transition), background var(--transition);
+  white-space: nowrap;
+  line-height: 1.6;
+}
+.model-selector-btn:hover {
+  color: var(--text-2);
+  border-color: rgba(237,228,206,0.20);
+  background: rgba(237,228,206,0.04);
+}
+.model-selector-name {
+  font-weight: 600;
+  letter-spacing: -0.01em;
+}
+/* Speed badge: fast = green-ish, slow = amber.
+   App.jsx builds the modifier class as `model-speed-${speed}`, so a speed value
+   other than "fast" or "slow" gets only the base .model-speed-badge styling. */
+.model-speed-badge {
+  font-size: 9px;
+  font-weight: 600;
+  padding: 0px 4px;
+  border-radius: 3px;
+  letter-spacing: 0;
+}
+.model-speed-fast {
+  background: rgba(114,184,126,0.15);
+  color: var(--green, #72b87e);
+  border: 1px solid rgba(114,184,126,0.25);
+}
+.model-speed-slow {
+  background: rgba(212,171,90,0.12);
+  color: #c9a85a;
+  border: 1px solid rgba(212,171,90,0.22);
+}
+.model-chevron {
+  color: var(--muted);
+  transition: transform var(--transition);
+  flex-shrink: 0;
+}
+.model-chevron.open {
+  transform: rotate(180deg);
+}
+/* The dropdown panel — opens above the button.
+   Positioned against .model-selector (position: relative): the panel's bottom
+   edge sits 6px above the selector's top edge, left-aligned with it. */
+.model-menu {
+  position: absolute;
+  bottom: calc(100% + 6px);
+  left: 0;
+  min-width: 300px;
+  background: var(--surface-3);
+  border: 1px solid var(--border);
+  border-radius: var(--radius);
+  /* Negative y-offset casts the shadow upward; the second layer is a zero-blur 1px spread ring. */
+  box-shadow:
+    0 -4px 24px rgba(10,6,4,0.55),
+    0 0 0 1px rgba(237,228,206,0.06);
+  z-index: 200;
+  overflow: hidden;
+  padding: 4px;
+}
+.model-menu-item {
+  display: block;
+  width: 100%;
+  text-align: left;
+  background: none;
+  border: none;
+  border-radius: calc(var(--radius) - 3px);
+  padding: 8px 10px;
+  cursor: pointer;
+  font-family: var(--sans);
+  transition: background var(--transition);
+}
+.model-menu-item:hover:not(:disabled) {
+  background: rgba(237,228,206,0.06);
+}
+.model-menu-item.active {
+  background: rgba(212,132,90,0.10);
+}
+/* Dimmed, non-interactive look for models flagged unavailable; App.jsx also sets
+   `disabled` on these buttons, so the :hover:not(:disabled) highlight never applies. */
+.model-menu-item.unavailable {
+  opacity: 0.45;
+  cursor: not-allowed;
+}
+.model-menu-row {
+  display: flex;
+  align-items: center;
+  gap: 6px;
+  margin-bottom: 3px;
+}
+.model-menu-name {
+  font-size: 12px;
+  font-weight: 600;
+  color: var(--text);
+  letter-spacing: -0.01em;
+}
+.model-menu-note {
+  font-size: 11px;
+  color: var(--muted);
+  line-height: 1.45;
+  letter-spacing: -0.01em;
+}
+.model-menu-unavail {
+  font-size: 10px;
+  color: var(--red, #c86858);
+  margin-top: 2px;
+  font-style: italic;
+}
 /* ══════════════════════════════════════════════════════════
    STATUS BAR
    ══════════════════════════════════════════════════════════ */