umanggarg Claude Sonnet 4.6 committed on
Commit
c0f8586
·
1 Parent(s): 1f180ad

Add model selector: dropdown UI + /agent/models endpoint

Browse files

- GET /agent/models returns catalog of available models with speed/note metadata
- POST /agent/stream now accepts model_id to override the default priority chain
- AgentService.stream() temporarily swaps client/provider/model per-request,
restoring the default in a finally block so the priority chain is preserved
- Fixed stream() indentation bug: for loop body was outside the try block
- Frontend: model selector dropdown in agent mode footer (like Claude's UI)
shows active model name + speed badge, dropdown lists all models with notes
- selectedModelId persisted to localStorage across page loads

Tested: /agent/models returns correct JSON; /agent/stream with model_id routes
to Cerebras Qwen3-235B and reports correct model in done event.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

backend/main.py CHANGED
@@ -57,7 +57,7 @@ from backend.models.schemas import (
57
  from backend.config import settings
58
  from backend.services.ingestion_service import IngestionService
59
  from backend.services.generation import GenerationService, classify_query
60
- from backend.services.agent import AgentService
61
  from backend.services.diagram_service import DiagramService
62
  from backend.services.repo_map_service import RepoMapService
63
  from backend.mcp_server import mcp, init_services as init_mcp_services
@@ -801,10 +801,42 @@ class AgentStreamRequest(BaseModel):
801
  """Request body for POST /agent/stream β€” agentic RAG with conversation history."""
802
  question: str
803
  repo: str | None = None
 
804
  # Conversation history: prior [{role, content}] turns for follow-up questions.
805
  history: list[dict] = []
806
 
807
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
808
  @app.post("/agent/stream", tags=["agent"])
809
  async def agent_stream(request: AgentStreamRequest):
810
  """
@@ -825,6 +857,7 @@ async def agent_stream(request: AgentStreamRequest):
825
  svc = _agent_service # may be None if no API key configured
826
  question = request.question
827
  repo = request.repo
 
828
  history = request.history[-10:] # cap at 5 exchanges
829
 
830
  async def event_stream():
@@ -851,7 +884,7 @@ async def agent_stream(request: AgentStreamRequest):
851
 
852
  async def _producer():
853
  try:
854
- async for event in svc.stream(question, repo_filter=repo, history=history):
855
  await queue.put(("event", event))
856
  await queue.put(("done", None))
857
  except Exception as exc:
 
57
  from backend.config import settings
58
  from backend.services.ingestion_service import IngestionService
59
  from backend.services.generation import GenerationService, classify_query
60
+ from backend.services.agent import AgentService, AGENT_MODELS
61
  from backend.services.diagram_service import DiagramService
62
  from backend.services.repo_map_service import RepoMapService
63
  from backend.mcp_server import mcp, init_services as init_mcp_services
 
801
  """Request body for POST /agent/stream β€” agentic RAG with conversation history."""
802
  question: str
803
  repo: str | None = None
804
+ model_id: str | None = None # catalog ID from /agent/models; None = auto priority chain
805
  # Conversation history: prior [{role, content}] turns for follow-up questions.
806
  history: list[dict] = []
807
 
808
 
809
+ @app.get("/agent/models", tags=["agent"])
810
+ async def agent_models():
811
+ """
812
+ Return the list of available agent models with metadata for the model selector UI.
813
+
814
+ Each entry has:
815
+ id: unique catalog ID sent back as model_id in /agent/stream requests
816
+ name: display name shown in the UI
817
+ provider: which API this model is served by
818
+ speed: "fast" | "slow" — used to show a visual indicator
819
+ speed_label: human-readable time estimate (e.g. "~40s")
820
+ note: one-sentence tradeoff description shown in the tooltip / expanded row
821
+ available: whether the required API key is configured on this server
822
+ """
823
+ from backend.config import settings
824
+ result = []
825
+ for m in AGENT_MODELS:
826
+ key_attr = m.get("requires", "")
827
+ available = bool(getattr(settings, key_attr, ""))
828
+ result.append({
829
+ "id": m["id"],
830
+ "name": m["name"],
831
+ "provider": m["provider"],
832
+ "speed": m["speed"],
833
+ "speed_label": m["speed_label"],
834
+ "note": m["note"],
835
+ "available": available,
836
+ })
837
+ return {"models": result}
838
+
839
+
840
  @app.post("/agent/stream", tags=["agent"])
841
  async def agent_stream(request: AgentStreamRequest):
842
  """
 
857
  svc = _agent_service # may be None if no API key configured
858
  question = request.question
859
  repo = request.repo
860
+ model_id = request.model_id
861
  history = request.history[-10:] # cap at 5 exchanges
862
 
863
  async def event_stream():
 
884
 
885
  async def _producer():
886
  try:
887
+ async for event in svc.stream(question, repo_filter=repo, history=history, model_id=model_id):
888
  await queue.put(("event", event))
889
  await queue.put(("done", None))
890
  except Exception as exc:
backend/models/schemas.py CHANGED
@@ -172,6 +172,11 @@ class AgentRequest(BaseModel):
172
  default=None,
173
  description="Restrict search to a specific repo slug (e.g. 'karpathy/micrograd')",
174
  )
 
 
 
 
 
175
 
176
 
177
  class AgentResponse(BaseModel):
 
172
  default=None,
173
  description="Restrict search to a specific repo slug (e.g. 'karpathy/micrograd')",
174
  )
175
+ model_id: Optional[str] = Field(
176
+ default=None,
177
+ description="Model catalog ID to use (e.g. 'cerebras/qwen3-235b'). Defaults to the server's priority chain.",
178
+ )
179
+ history: list = Field(default_factory=list, description="Prior conversation turns")
180
 
181
 
182
  class AgentResponse(BaseModel):
backend/services/agent.py CHANGED
@@ -297,6 +297,54 @@ def _sources_from_search_result(result_text: str, fallback_repo: str | None) ->
297
  # OpenRouter: free model with confirmed tool-calling support.
298
  # Required headers: HTTP-Referer (for attribution) and X-Title (app name).
299
  # Without HTTP-Referer, free tier access may be denied.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
  _OPENROUTER_MODEL = "qwen/qwen3-coder:free"
301
 
302
  # Groq models tried in order when the primary is over capacity or decommissioned.
@@ -472,8 +520,8 @@ class AgentService:
472
  base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
473
  )
474
  self._provider = "gemini"
475
- self._model = "gemini-2.0-flash"
476
- print("AgentService: using Google Gemini (gemini-2.0-flash) via MCP tools")
477
  elif settings.openrouter_api_key:
478
  self._client = _openrouter_client(settings.openrouter_api_key)
479
  self._provider = "openrouter"
@@ -557,7 +605,11 @@ class AgentService:
557
  }
558
 
559
  async def stream(
560
- self, question: str, repo_filter: str | None = None, history: list[dict] | None = None
 
 
 
 
561
  ) -> AsyncIterator[dict]:
562
  """
563
  Stream agent progress as an async generator.
@@ -581,129 +633,146 @@ class AgentService:
581
  we re-run with stream=True so tokens arrive in real time.
582
  This is one extra LLM call but delivers genuine streaming UX.
583
  """
584
- # Discover tools from MCP server (cached after first call)
585
- mcp_tools = await self.mcp.list_tools()
586
- messages = self._build_initial_messages(question, repo_filter, history)
587
-
588
- # Clear session notes from any previous run so this conversation starts fresh.
589
- # Note: we import here to avoid circular imports at module load time.
590
- from backend.mcp_server import clear_notes
591
- clear_notes()
592
-
593
- # Loop detection: skip duplicate tool calls in the stream path too.
594
- seen_calls: set[tuple] = set()
595
-
596
- # Collect source references from tool calls for the sources panel.
597
- # Keyed by (repo, filepath, start_line) to deduplicate across iterations.
598
- collected_sources: dict[tuple, dict] = {}
599
-
600
- for iteration in range(self.MAX_ITERATIONS):
601
- # Run sync LLM call in thread pool — doesn't block the event loop
602
- # Pass raw mcp_tools so _call_llm can reformat if provider switches mid-run
603
- step = await asyncio.to_thread(self._call_llm, messages, mcp_tools)
604
-
605
- if step["done"]:
606
- # Stream the final answer with real token-by-token delivery.
607
- # We pass messages (with all tool results) to the streaming call
608
- # and tell the LLM not to use tools (tool_choice="none") so it
609
- # goes straight to answering.
610
- async for token in self._stream_final_answer(messages, mcp_tools):
611
- yield {"type": "token", "text": token}
612
- # Emit sources collected across all tool calls before done event
613
- if collected_sources:
614
- yield {"type": "sources", "sources": list(collected_sources.values())}
615
- yield {"type": "done", "iterations": iteration + 1, "model": self._model}
616
- return
617
 
618
- messages.append(step["assistant_message"])
619
-
620
- # Emit any pre-tool reasoning text the LLM produced before calling tools.
621
- # This lets the UI show "thought bubbles" in the trace timeline β€”
622
- # the user sees WHY each tool was chosen, not just WHAT was called.
623
- thought = _extract_thought(step["assistant_message"], self._provider)
624
- if thought:
625
- yield {"type": "thought", "text": thought}
626
-
627
- # ── Parallel tool execution ───────────────────────────────────────
628
- # The LLM may return multiple tool calls in one turn (e.g. search_code
629
- # called 2-3 times for different query angles simultaneously).
630
- # Instead of serial execution, we:
631
- # 1. Emit tool_call events for all new (non-duplicate) calls upfront
632
- # 2. Run them concurrently with asyncio.gather
633
- # 3. Emit tool_result events for all after they complete
634
- #
635
- # This reduces latency proportionally to the number of parallel calls
636
- # (3 serial 500ms searches β†’ 1 parallel 500ms round trip).
637
-
638
- # Separate new calls from duplicates
639
- new_calls: list[dict] = []
640
- for tc in step["tool_calls"]:
641
- call_key = (tc["name"], tuple(sorted(tc["input"].items())))
642
- if call_key in seen_calls:
643
- dup_msg = f"[Skipped duplicate {tc['name']} call β€” already ran with these arguments]"
644
- yield {"type": "tool_result", "tool": tc["name"], "output": dup_msg}
645
- messages.append(self._build_tool_result(tc["id"], tc["name"], dup_msg))
646
- else:
647
- seen_calls.add(call_key)
648
- new_calls.append(tc)
649
- # Emit tool_call events immediately so UI shows them in parallel
650
- yield {"type": "tool_call", "tool": tc["name"], "input": tc["input"]}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
651
 
652
- if not new_calls:
653
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
654
 
655
- # Execute all new calls concurrently β€” MCP calls are async HTTP round trips
656
- async def _run_tool(tc: dict) -> str:
657
- try:
658
- return await self.mcp.call_tool(tc["name"], tc["input"])
659
- except Exception as e:
660
- return f"Tool error: {e}"
661
-
662
- parallel_results = await asyncio.gather(*[_run_tool(tc) for tc in new_calls])
663
-
664
- # Process results in the same order as the calls
665
- for tc, result in zip(new_calls, parallel_results):
666
- # Collect source metadata for the sources panel
667
- if tc["name"] == "get_file_chunk":
668
- src = _source_from_chunk_call(tc["input"], result)
669
- if src:
670
- key = (src["repo"], src["filepath"], src["start_line"])
671
- collected_sources[key] = src
672
-
673
- if tc["name"] in ("search_code", "find_callers", "search_symbol") and not result.startswith("No results"):
674
- for src in _sources_from_search_result(result, tc["input"].get("repo") or repo_filter):
675
- key = (src["repo"], src["filepath"], src["start_line"])
676
- collected_sources[key] = src
677
-
678
- # read_file returns a whole file β€” record it as a single source entry
679
- if tc["name"] == "read_file" and tc["input"].get("filepath"):
680
- repo = tc["input"].get("repo", repo_filter or "")
681
- filepath = tc["input"]["filepath"]
682
- key = (repo, filepath, 0)
683
- if key not in collected_sources:
684
- ext = "." + filepath.rsplit(".", 1)[-1].lower() if "." in filepath else ""
685
- lang = {"py": "python", "js": "javascript", "ts": "typescript",
686
- "go": "go", "rs": "rust", "java": "java"}.get(ext.lstrip("."), "text")
687
- collected_sources[key] = {
688
- "repo": repo, "filepath": filepath, "language": lang,
689
- "chunk_type": "file", "name": filepath.rsplit("/", 1)[-1],
690
- "start_line": 1, "end_line": result.count("\n"),
691
- "score": 1.0, "text": "",
692
- }
693
-
694
- display = result[:500] + "…" if len(result) > 500 else result
695
- yield {"type": "tool_result", "tool": tc["name"], "output": display}
696
- messages.append(self._build_tool_result(tc["id"], tc["name"], result))
697
 
698
- # MAX_ITERATIONS hit β€” LLM never voluntarily stopped, but it has gathered
699
- # context from all its tool calls. Force a final answer from that context
700
- # rather than returning silence.
701
- async for token in self._stream_final_answer(messages, mcp_tools):
702
- yield {"type": "token", "text": token}
703
- # Emit any collected sources even when we hit the iteration cap
704
- if collected_sources:
705
- yield {"type": "sources", "sources": list(collected_sources.values())}
706
- yield {"type": "done", "iterations": self.MAX_ITERATIONS, "model": self._model}
 
 
 
 
 
707
 
708
  async def _stream_final_answer(self, messages: list, mcp_tools: list) -> AsyncIterator[str]:
709
  """
@@ -813,8 +882,8 @@ class AgentService:
813
  base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
814
  )
815
  self._provider = "gemini"
816
- self._model = "gemini-2.0-flash"
817
- print("AgentService: Cerebras limit hit β€” switched to Gemini (gemini-2.0-flash)")
818
  return True
819
  if self._provider in ("cerebras", "gemini") and settings.openrouter_api_key:
820
  self._client = _openrouter_client(settings.openrouter_api_key)
 
297
  # OpenRouter: free model with confirmed tool-calling support.
298
  # Required headers: HTTP-Referer (for attribution) and X-Title (app name).
299
  # Without HTTP-Referer, free tier access may be denied.
300
+ # ── Model catalog ─────────────────────────────────────────────────────────────
301
+ # Each entry describes a model the user can select from the UI.
302
+ # "requires" is the settings key that must be non-empty for this model to appear.
303
+ # "provider" must match the strings used in _call_groq / _call_anthropic routing.
304
+ AGENT_MODELS: list[dict] = [
305
+ {
306
+ "id": "cerebras/qwen3-235b",
307
+ "name": "Qwen3 235B",
308
+ "provider": "cerebras",
309
+ "model": "qwen-3-235b-a22b-instruct-2507",
310
+ "requires": "cerebras_api_key",
311
+ "speed": "fast",
312
+ "speed_label": "~40s",
313
+ "note": "Best balance. Fast inference (1400 tok/s), strong tool use, generous free quota.",
314
+ },
315
+ {
316
+ "id": "google/gemma4-31b",
317
+ "name": "Gemma 4 31B",
318
+ "provider": "gemini",
319
+ "model": "gemma-4-31b-it",
320
+ "requires": "gemini_api_key",
321
+ "speed": "slow",
322
+ "speed_label": "~90s",
323
+ "note": "Highest quality. Reads actual source files. Slower but thorough. Free via AI Studio.",
324
+ },
325
+ {
326
+ "id": "google/gemini-flash",
327
+ "name": "Gemini 2.0 Flash",
328
+ "provider": "gemini",
329
+ "model": "gemini-2.0-flash",
330
+ "requires": "gemini_api_key",
331
+ "speed": "fast",
332
+ "speed_label": "~15s",
333
+ "note": "Fastest. Lower quality than Gemma 4. 1500 req/day free limit.",
334
+ },
335
+ ]
336
+
337
+ def _make_client(model_entry: dict):
338
+ """Instantiate the right API client for a model catalog entry."""
339
+ from openai import OpenAI
340
+ if model_entry["provider"] == "cerebras":
341
+ return OpenAI(api_key=settings.cerebras_api_key, base_url="https://api.cerebras.ai/v1")
342
+ else: # gemini
343
+ return OpenAI(
344
+ api_key=settings.gemini_api_key,
345
+ base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
346
+ )
347
+
348
  _OPENROUTER_MODEL = "qwen/qwen3-coder:free"
349
 
350
  # Groq models tried in order when the primary is over capacity or decommissioned.
 
520
  base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
521
  )
522
  self._provider = "gemini"
523
+ self._model = "gemma-4-31b-it"
524
+ print("AgentService: using Gemma 4 31B (gemma-4-31b-it) via MCP tools")
525
  elif settings.openrouter_api_key:
526
  self._client = _openrouter_client(settings.openrouter_api_key)
527
  self._provider = "openrouter"
 
605
  }
606
 
607
  async def stream(
608
+ self,
609
+ question: str,
610
+ repo_filter: str | None = None,
611
+ history: list[dict] | None = None,
612
+ model_id: str | None = None,
613
  ) -> AsyncIterator[dict]:
614
  """
615
  Stream agent progress as an async generator.
 
633
  we re-run with stream=True so tokens arrive in real time.
634
  This is one extra LLM call but delivers genuine streaming UX.
635
  """
636
+ # ── Per-request model override ────────────────────────────────────────
637
+ # If the user selected a specific model in the UI, temporarily swap to it.
638
+ # We save/restore self._client/provider/model in a finally block so the
639
+ # default priority chain is preserved for the next request.
640
+ _orig = (self._client, self._provider, self._model)
641
+ entry = next((m for m in AGENT_MODELS if m["id"] == model_id), None)
642
+ if entry:
643
+ self._client = _make_client(entry)
644
+ self._provider = entry["provider"]
645
+ self._model = entry["model"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
646
 
647
+ try:
648
+ # Discover tools from MCP server (cached after first call)
649
+ mcp_tools = await self.mcp.list_tools()
650
+ messages = self._build_initial_messages(question, repo_filter, history)
651
+
652
+ # Clear session notes from any previous run so this conversation starts fresh.
653
+ # Note: we import here to avoid circular imports at module load time.
654
+ from backend.mcp_server import clear_notes
655
+ clear_notes()
656
+
657
+ # Loop detection: skip duplicate tool calls in the stream path too.
658
+ seen_calls: set[tuple] = set()
659
+
660
+ # Collect source references from tool calls for the sources panel.
661
+ # Keyed by (repo, filepath, start_line) to deduplicate across iterations.
662
+ collected_sources: dict[tuple, dict] = {}
663
+
664
+ for iteration in range(self.MAX_ITERATIONS):
665
+ # Run sync LLM call in thread pool — doesn't block the event loop
666
+ # Pass raw mcp_tools so _call_llm can reformat if provider switches mid-run
667
+ step = await asyncio.to_thread(self._call_llm, messages, mcp_tools)
668
+
669
+ if step["done"]:
670
+ # Stream the final answer with real token-by-token delivery.
671
+ # We pass messages (with all tool results) to the streaming call
672
+ # and tell the LLM not to use tools (tool_choice="none") so it
673
+ # goes straight to answering.
674
+ async for token in self._stream_final_answer(messages, mcp_tools):
675
+ yield {"type": "token", "text": token}
676
+ # Emit sources collected across all tool calls before done event
677
+ if collected_sources:
678
+ yield {"type": "sources", "sources": list(collected_sources.values())}
679
+ yield {"type": "done", "iterations": iteration + 1, "model": self._model}
680
+ return
681
+
682
+ messages.append(step["assistant_message"])
683
+
684
+ # Emit any pre-tool reasoning text the LLM produced before calling tools.
685
+ # This lets the UI show "thought bubbles" in the trace timeline β€”
686
+ # the user sees WHY each tool was chosen, not just WHAT was called.
687
+ thought = _extract_thought(step["assistant_message"], self._provider)
688
+ if thought:
689
+ yield {"type": "thought", "text": thought}
690
+
691
+ # ── Parallel tool execution ───────────────────────────────────────
692
+ # The LLM may return multiple tool calls in one turn (e.g. search_code
693
+ # called 2-3 times for different query angles simultaneously).
694
+ # Instead of serial execution, we:
695
+ # 1. Emit tool_call events for all new (non-duplicate) calls upfront
696
+ # 2. Run them concurrently with asyncio.gather
697
+ # 3. Emit tool_result events for all after they complete
698
+ #
699
+ # This reduces latency proportionally to the number of parallel calls
700
+ # (3 serial 500ms searches β†’ 1 parallel 500ms round trip).
701
+
702
+ # Separate new calls from duplicates
703
+ new_calls: list[dict] = []
704
+ for tc in step["tool_calls"]:
705
+ call_key = (tc["name"], tuple(sorted(tc["input"].items())))
706
+ if call_key in seen_calls:
707
+ dup_msg = f"[Skipped duplicate {tc['name']} call β€” already ran with these arguments]"
708
+ yield {"type": "tool_result", "tool": tc["name"], "output": dup_msg}
709
+ messages.append(self._build_tool_result(tc["id"], tc["name"], dup_msg))
710
+ else:
711
+ seen_calls.add(call_key)
712
+ new_calls.append(tc)
713
+ # Emit tool_call events immediately so UI shows them in parallel
714
+ yield {"type": "tool_call", "tool": tc["name"], "input": tc["input"]}
715
+
716
+ if not new_calls:
717
+ continue
718
 
719
+ # Execute all new calls concurrently — MCP calls are async HTTP round trips
720
+ async def _run_tool(tc: dict) -> str:
721
+ try:
722
+ return await self.mcp.call_tool(tc["name"], tc["input"])
723
+ except Exception as e:
724
+ return f"Tool error: {e}"
725
+
726
+ parallel_results = await asyncio.gather(*[_run_tool(tc) for tc in new_calls])
727
+
728
+ # Process results in the same order as the calls
729
+ for tc, result in zip(new_calls, parallel_results):
730
+ # Collect source metadata for the sources panel
731
+ if tc["name"] == "get_file_chunk":
732
+ src = _source_from_chunk_call(tc["input"], result)
733
+ if src:
734
+ key = (src["repo"], src["filepath"], src["start_line"])
735
+ collected_sources[key] = src
736
+
737
+ if tc["name"] in ("search_code", "find_callers", "search_symbol") and not result.startswith("No results"):
738
+ for src in _sources_from_search_result(result, tc["input"].get("repo") or repo_filter):
739
+ key = (src["repo"], src["filepath"], src["start_line"])
740
+ collected_sources[key] = src
741
+
742
+ # read_file returns a whole file — record it as a single source entry
743
+ if tc["name"] == "read_file" and tc["input"].get("filepath"):
744
+ repo = tc["input"].get("repo", repo_filter or "")
745
+ filepath = tc["input"]["filepath"]
746
+ key = (repo, filepath, 0)
747
+ if key not in collected_sources:
748
+ ext = "." + filepath.rsplit(".", 1)[-1].lower() if "." in filepath else ""
749
+ lang = {"py": "python", "js": "javascript", "ts": "typescript",
750
+ "go": "go", "rs": "rust", "java": "java"}.get(ext.lstrip("."), "text")
751
+ collected_sources[key] = {
752
+ "repo": repo, "filepath": filepath, "language": lang,
753
+ "chunk_type": "file", "name": filepath.rsplit("/", 1)[-1],
754
+ "start_line": 1, "end_line": result.count("\n"),
755
+ "score": 1.0, "text": "",
756
+ }
757
 
758
+ display = result[:500] + "…" if len(result) > 500 else result
759
+ yield {"type": "tool_result", "tool": tc["name"], "output": display}
760
+ messages.append(self._build_tool_result(tc["id"], tc["name"], result))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
761
 
762
+ # MAX_ITERATIONS hit — LLM never voluntarily stopped, but it has gathered
763
+ # context from all its tool calls. Force a final answer from that context
764
+ # rather than returning silence.
765
+ async for token in self._stream_final_answer(messages, mcp_tools):
766
+ yield {"type": "token", "text": token}
767
+ # Emit any collected sources even when we hit the iteration cap
768
+ if collected_sources:
769
+ yield {"type": "sources", "sources": list(collected_sources.values())}
770
+ yield {"type": "done", "iterations": self.MAX_ITERATIONS, "model": self._model}
771
+
772
+ finally:
773
+ # Restore original client/provider/model so the next request uses the
774
+ # default priority chain regardless of what model was selected this time.
775
+ self._client, self._provider, self._model = _orig
776
 
777
  async def _stream_final_answer(self, messages: list, mcp_tools: list) -> AsyncIterator[str]:
778
  """
 
882
  base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
883
  )
884
  self._provider = "gemini"
885
+ self._model = "gemma-4-31b-it"
886
+ print("AgentService: Cerebras limit hit β€” switched to Gemma 4 31B (gemma-4-31b-it)")
887
  return True
888
  if self._provider in ("cerebras", "gemini") and settings.openrouter_api_key:
889
  self._client = _openrouter_client(settings.openrouter_api_key)
ui/src/App.jsx CHANGED
@@ -2,7 +2,7 @@ import { useState, useEffect, useRef, useCallback } from "react";
2
  import Sidebar from "./components/Sidebar";
3
  import Message from "./components/Message";
4
  import DiagramView from "./components/DiagramView";
5
- import { fetchRepos, streamQuery, streamAgentQuery, fetchMcpStatus, fetchMcpPrompt } from "./api";
6
 
7
  export default function App() {
8
  const [repos, setRepos] = useState([]);
@@ -31,6 +31,14 @@ export default function App() {
31
  const [promptMenu, setPromptMenu] = useState(false); // dropdown visible
32
  const [promptFilter, setPromptFilter] = useState(""); // text after "/"
33
 
 
 
 
 
 
 
 
 
34
  const bottomRef = useRef(null);
35
  const scrollRef = useRef(null);
36
  const latestAssistantRef = useRef(null); // top of the current streaming assistant message
@@ -83,6 +91,33 @@ export default function App() {
83
  useEffect(() => { streamingRef.current = streaming; }, [streaming]);
84
  // Persist agent mode preference across page loads
85
  useEffect(() => { localStorage.setItem('ghrc_agentMode', agentMode); }, [agentMode]);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  // Keep handleSubmitRef pointing at the latest handleSubmit (avoids stale closures
87
  // in the rate-limit countdown which captures this ref via closure).
88
  // We update it on every render so it always has the current state in scope.
@@ -390,6 +425,7 @@ export default function App() {
390
  stop = streamAgentQuery({
391
  question,
392
  repo: activeRepo,
 
393
  history,
394
  onThought: (text) => {
395
  // Append a thought entry to the trace β€” rendered as a reasoning bubble
@@ -846,9 +882,55 @@ export default function App() {
846
  <div className="input-hint" aria-hidden="true">{isMac ? "⌘K" : "Ctrl+K"}</div>
847
  )}
848
  </div>
849
- {/* Agent mode indicator β€” small label below the textarea row */}
850
  {agentMode && (
851
- <div className="input-mode-badge" aria-hidden="true" title="Agent mode β€” runs the ReAct loop (Reason + Act): searches the codebase, reads the result, decides if it needs more context, then searches again. The same pattern production agents use.">✦ Agent</div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
852
  )}
853
  </div>
854
  </>
 
2
  import Sidebar from "./components/Sidebar";
3
  import Message from "./components/Message";
4
  import DiagramView from "./components/DiagramView";
5
+ import { fetchRepos, streamQuery, streamAgentQuery, fetchMcpStatus, fetchMcpPrompt, fetchAgentModels } from "./api";
6
 
7
  export default function App() {
8
  const [repos, setRepos] = useState([]);
 
31
  const [promptMenu, setPromptMenu] = useState(false); // dropdown visible
32
  const [promptFilter, setPromptFilter] = useState(""); // text after "/"
33
 
34
+ // Model selector: available models fetched from /agent/models
35
+ const [agentModels, setAgentModels] = useState([]);
36
+ const [selectedModelId, setSelectedModelId] = useState(
37
+ () => localStorage.getItem('ghrc_selectedModel') || null
38
+ );
39
+ const [modelMenuOpen, setModelMenuOpen] = useState(false);
40
+ const modelMenuRef = useRef(null);
41
+
42
  const bottomRef = useRef(null);
43
  const scrollRef = useRef(null);
44
  const latestAssistantRef = useRef(null); // top of the current streaming assistant message
 
91
  useEffect(() => { streamingRef.current = streaming; }, [streaming]);
92
  // Persist agent mode preference across page loads
93
  useEffect(() => { localStorage.setItem('ghrc_agentMode', agentMode); }, [agentMode]);
94
+ // Persist selected model
95
+ useEffect(() => {
96
+ if (selectedModelId) localStorage.setItem('ghrc_selectedModel', selectedModelId);
97
+ else localStorage.removeItem('ghrc_selectedModel');
98
+ }, [selectedModelId]);
99
+ // Fetch available agent models once on mount
100
+ useEffect(() => {
101
+ fetchAgentModels().then(models => {
102
+ setAgentModels(models);
103
+ // If no model selected yet, default to the first available one
104
+ setSelectedModelId(prev => {
105
+ if (prev && models.some(m => m.id === prev)) return prev;
106
+ const first = models.find(m => m.available);
107
+ return first ? first.id : null;
108
+ });
109
+ });
110
+ }, []);
111
+ // Close model menu when clicking outside
112
+ useEffect(() => {
113
+ function onClickOutside(e) {
114
+ if (modelMenuRef.current && !modelMenuRef.current.contains(e.target)) {
115
+ setModelMenuOpen(false);
116
+ }
117
+ }
118
+ document.addEventListener("mousedown", onClickOutside);
119
+ return () => document.removeEventListener("mousedown", onClickOutside);
120
+ }, []);
121
  // Keep handleSubmitRef pointing at the latest handleSubmit (avoids stale closures
122
  // in the rate-limit countdown which captures this ref via closure).
123
  // We update it on every render so it always has the current state in scope.
 
425
  stop = streamAgentQuery({
426
  question,
427
  repo: activeRepo,
428
+ model_id: selectedModelId || undefined,
429
  history,
430
  onThought: (text) => {
431
  // Append a thought entry to the trace β€” rendered as a reasoning bubble
 
882
  <div className="input-hint" aria-hidden="true">{isMac ? "⌘K" : "Ctrl+K"}</div>
883
  )}
884
  </div>
885
+ {/* Agent mode footer: badge + model selector */}
886
  {agentMode && (
887
+ <div className="input-footer-row">
888
+ <div className="input-mode-badge" title="Agent mode β€” runs the ReAct loop (Reason + Act): searches the codebase, reads the result, decides if it needs more context, then searches again. The same pattern production agents use.">✦ Agent</div>
889
+ {agentModels.length > 0 && (() => {
890
+ const active = agentModels.find(m => m.id === selectedModelId) || agentModels.find(m => m.available) || agentModels[0];
891
+ return (
892
+ <div className="model-selector" ref={modelMenuRef}>
893
+ <button
894
+ className="model-selector-btn"
895
+ onClick={() => setModelMenuOpen(o => !o)}
896
+ title={active?.note}
897
+ >
898
+ <span className="model-selector-name">{active?.name ?? "Auto"}</span>
899
+ {active && <span className={`model-speed-badge model-speed-${active.speed}`}>{active.speed_label}</span>}
900
+ {/* chevron */}
901
+ <svg className={`model-chevron${modelMenuOpen ? " open" : ""}`} width="10" height="10" viewBox="0 0 10 10" fill="none">
902
+ <path d="M2 3.5L5 6.5L8 3.5" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round"/>
903
+ </svg>
904
+ </button>
905
+ {modelMenuOpen && (
906
+ <div className="model-menu">
907
+ {agentModels.map(m => (
908
+ <button
909
+ key={m.id}
910
+ className={`model-menu-item${m.id === selectedModelId ? " active" : ""}${!m.available ? " unavailable" : ""}`}
911
+ onClick={() => { setSelectedModelId(m.id); setModelMenuOpen(false); }}
912
+ disabled={!m.available}
913
+ title={!m.available ? `Requires ${m.provider} API key` : undefined}
914
+ >
915
+ <div className="model-menu-row">
916
+ <span className="model-menu-name">{m.name}</span>
917
+ <span className={`model-speed-badge model-speed-${m.speed}`}>{m.speed_label}</span>
918
+ {m.id === selectedModelId && (
919
+ <svg width="12" height="12" viewBox="0 0 12 12" fill="none" style={{marginLeft:"auto",flexShrink:0}}>
920
+ <path d="M2 6l3 3 5-5" stroke="currentColor" strokeWidth="1.5" strokeLinecap="round" strokeLinejoin="round"/>
921
+ </svg>
922
+ )}
923
+ </div>
924
+ <div className="model-menu-note">{m.note}</div>
925
+ {!m.available && <div className="model-menu-unavail">API key not configured</div>}
926
+ </button>
927
+ ))}
928
+ </div>
929
+ )}
930
+ </div>
931
+ );
932
+ })()}
933
+ </div>
934
  )}
935
  </div>
936
  </>
ui/src/api.js CHANGED
@@ -2,6 +2,13 @@
2
  // In production: set VITE_API_URL in Vercel environment variables
3
  const BASE = import.meta.env.VITE_API_URL || "http://localhost:8000";
4
 
 
 
 
 
 
 
 
5
  export async function fetchRepos() {
6
  const res = await fetch(`${BASE}/repos`);
7
  if (!res.ok) throw new Error("Failed to fetch repos");
@@ -312,13 +319,13 @@ export function streamQuery({ question, repo, mode, history, onToken, onSources,
312
  * onDone(iterations) — agent finished
313
  * onError(msg) — connection or server error
314
  */
315
- export function streamAgentQuery({ question, repo, history, onThought, onToolCall, onToolResult, onToken, onSources, onDone, onError }) {
316
  const controller = new AbortController();
317
 
318
  fetch(`${BASE}/agent/stream`, {
319
  method: "POST",
320
  headers: { "Content-Type": "application/json" },
321
- body: JSON.stringify({ question, repo: repo || null, history: history || [] }),
322
  signal: controller.signal,
323
  }).then(async (res) => {
324
  if (!res.ok) { onError?.(`Server error ${res.status}`); return; }
 
2
  // In production: set VITE_API_URL in Vercel environment variables
3
  const BASE = import.meta.env.VITE_API_URL || "http://localhost:8000";
4
 
5
+ export async function fetchAgentModels() {
6
+ const res = await fetch(`${BASE}/agent/models`);
7
+ if (!res.ok) return [];
8
+ const data = await res.json();
9
+ return data.models || [];
10
+ }
11
+
12
  export async function fetchRepos() {
13
  const res = await fetch(`${BASE}/repos`);
14
  if (!res.ok) throw new Error("Failed to fetch repos");
 
319
  * onDone(iterations) — agent finished
320
  * onError(msg) — connection or server error
321
  */
322
+ export function streamAgentQuery({ question, repo, model_id, history, onThought, onToolCall, onToolResult, onToken, onSources, onDone, onError }) {
323
  const controller = new AbortController();
324
 
325
  fetch(`${BASE}/agent/stream`, {
326
  method: "POST",
327
  headers: { "Content-Type": "application/json" },
328
+ body: JSON.stringify({ question, repo: repo || null, model_id: model_id || null, history: history || [] }),
329
  signal: controller.signal,
330
  }).then(async (res) => {
331
  if (!res.ok) { onError?.(`Server error ${res.status}`); return; }
ui/src/index.css CHANGED
@@ -1566,6 +1566,151 @@ textarea:focus-visible {
1566
  gap: 4px;
1567
  }
1568
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1569
  /* ══════════════════════════════════════════════════════════
1570
  STATUS BAR
1571
  ══════════════════════════════════════════════════════════ */
 
1566
  gap: 4px;
1567
  }
1568
 
1569
+ /* ══════════════════════════════════════════════════════════
1570
+ INPUT FOOTER ROW — wraps agent badge + model selector
1571
+ ══════════════════════════════════════════════════════════ */
1572
+ .input-footer-row {
1573
+ display: flex;
1574
+ align-items: center;
1575
+ gap: 8px;
1576
+ margin-top: 5px;
1577
+ /* Remove the margin-top from badge when it's inside this row */
1578
+ }
1579
+ .input-footer-row .input-mode-badge {
1580
+ margin-top: 0;
1581
+ }
1582
+
1583
+ /* ══════════════════════════════════════════════════════════
1584
+ MODEL SELECTOR — dropdown button + floating menu
1585
+ ══════════════════════════════════════════════════════════ */
1586
+ .model-selector {
1587
+ position: relative;
1588
+ }
1589
+
1590
+ .model-selector-btn {
1591
+ display: inline-flex;
1592
+ align-items: center;
1593
+ gap: 5px;
1594
+ background: none;
1595
+ border: 1px solid rgba(237,228,206,0.10);
1596
+ border-radius: var(--radius-sm);
1597
+ padding: 2px 7px 2px 6px;
1598
+ cursor: pointer;
1599
+ font-family: var(--sans);
1600
+ font-size: 10px;
1601
+ font-weight: 500;
1602
+ color: var(--muted);
1603
+ transition: color var(--transition), border-color var(--transition), background var(--transition);
1604
+ white-space: nowrap;
1605
+ line-height: 1.6;
1606
+ }
1607
+ .model-selector-btn:hover {
1608
+ color: var(--text-2);
1609
+ border-color: rgba(237,228,206,0.20);
1610
+ background: rgba(237,228,206,0.04);
1611
+ }
1612
+
1613
+ .model-selector-name {
1614
+ font-weight: 600;
1615
+ letter-spacing: -0.01em;
1616
+ }
1617
+
1618
+ /* Speed badge: fast = green-ish, slow = amber */
1619
+ .model-speed-badge {
1620
+ font-size: 9px;
1621
+ font-weight: 600;
1622
+ padding: 0px 4px;
1623
+ border-radius: 3px;
1624
+ letter-spacing: 0;
1625
+ }
1626
+ .model-speed-fast {
1627
+ background: rgba(114,184,126,0.15);
1628
+ color: var(--green, #72b87e);
1629
+ border: 1px solid rgba(114,184,126,0.25);
1630
+ }
1631
+ .model-speed-slow {
1632
+ background: rgba(212,171,90,0.12);
1633
+ color: #c9a85a;
1634
+ border: 1px solid rgba(212,171,90,0.22);
1635
+ }
1636
+
1637
+ .model-chevron {
1638
+ color: var(--muted);
1639
+ transition: transform var(--transition);
1640
+ flex-shrink: 0;
1641
+ }
1642
+ .model-chevron.open {
1643
+ transform: rotate(180deg);
1644
+ }
1645
+
1646
+ /* The dropdown panel — opens above the button */
1647
+ .model-menu {
1648
+ position: absolute;
1649
+ bottom: calc(100% + 6px);
1650
+ left: 0;
1651
+ min-width: 300px;
1652
+ background: var(--surface-3);
1653
+ border: 1px solid var(--border);
1654
+ border-radius: var(--radius);
1655
+ box-shadow:
1656
+ 0 -4px 24px rgba(10,6,4,0.55),
1657
+ 0 0 0 1px rgba(237,228,206,0.06);
1658
+ z-index: 200;
1659
+ overflow: hidden;
1660
+ padding: 4px;
1661
+ }
1662
+
1663
+ .model-menu-item {
1664
+ display: block;
1665
+ width: 100%;
1666
+ text-align: left;
1667
+ background: none;
1668
+ border: none;
1669
+ border-radius: calc(var(--radius) - 3px);
1670
+ padding: 8px 10px;
1671
+ cursor: pointer;
1672
+ font-family: var(--sans);
1673
+ transition: background var(--transition);
1674
+ }
1675
+ .model-menu-item:hover:not(:disabled) {
1676
+ background: rgba(237,228,206,0.06);
1677
+ }
1678
+ .model-menu-item.active {
1679
+ background: rgba(212,132,90,0.10);
1680
+ }
1681
+ .model-menu-item.unavailable {
1682
+ opacity: 0.45;
1683
+ cursor: not-allowed;
1684
+ }
1685
+
1686
+ .model-menu-row {
1687
+ display: flex;
1688
+ align-items: center;
1689
+ gap: 6px;
1690
+ margin-bottom: 3px;
1691
+ }
1692
+
1693
+ .model-menu-name {
1694
+ font-size: 12px;
1695
+ font-weight: 600;
1696
+ color: var(--text);
1697
+ letter-spacing: -0.01em;
1698
+ }
1699
+
1700
+ .model-menu-note {
1701
+ font-size: 11px;
1702
+ color: var(--muted);
1703
+ line-height: 1.45;
1704
+ letter-spacing: -0.01em;
1705
+ }
1706
+
1707
+ .model-menu-unavail {
1708
+ font-size: 10px;
1709
+ color: var(--red, #c86858);
1710
+ margin-top: 2px;
1711
+ font-style: italic;
1712
+ }
1713
+
1714
  /* ══════════════════════════════════════════════════════════
1715
  STATUS BAR
1716
  ══════════════════════════════════════════════════════════ */