umanggarg Claude Sonnet 4.6 committed
Commit 6888bb5 · 1 Parent(s): ea8edd8

Phase 5: MCP server + Agentic RAG (ReAct loop)


Backend:
- backend/services/agent.py — AgentService with ReAct loop using Anthropic
  tool use. Detailed docstrings explain agent vs plain RAG, tool use message
  format, and stopping conditions. run() + stream() methods.
- backend/main.py — /agent/query (sync) and /agent/stream (SSE) endpoints.
  AgentService initialised at startup only when ANTHROPIC_API_KEY is set.
- backend/models/schemas.py — AgentRequest, AgentResponse, AgentToolCall schemas.

MCP server:
- mcp_server/server.py — Full MCP server exposing RAG as MCP primitives.
  Tools: search_code, list_repos, get_file_chunk, find_callers.
  Resources: indexed repos as rag://repos/owner/name URIs.
  Prompts: explain-function, repo-overview (slash commands in Claude Desktop).
  Docstring explains MCP from scratch: primitives, transports, why it exists.

UI:
- Agent mode toggle in sidebar (RAG vs Agent ✦)
- Live tool-call trace: each search step appears as the agent runs, with a
  spinner on the in-flight call. Collapses to a toggle after completion.
- api.js — streamAgentQuery() with tool_call/tool_result/done SSE events.
- index.css — .agent-trace.live with accent border, .agent-step.pending/.done.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

backend/main.py CHANGED
@@ -30,6 +30,8 @@ Endpoints:
   POST /search — retrieve chunks (no generation)
   POST /query — RAG: retrieve + generate answer
   GET /query/stream — RAG with streaming SSE response
+  POST /agent/query — Agentic RAG: ReAct loop (synchronous)
+  GET /agent/stream — Agentic RAG: ReAct loop with SSE progress stream
 """
 
 from contextlib import asynccontextmanager
@@ -44,10 +46,12 @@ from backend.models.schemas import (
     SearchRequest, SearchResponse, CodeChunk,
     QueryRequest, QueryResponse,
     ReposResponse, RepoInfo,
+    AgentRequest, AgentResponse, AgentToolCall,
 )
 from backend.config import settings
 from backend.services.ingestion_service import IngestionService
 from backend.services.generation import GenerationService, classify_query
+from backend.services.agent import AgentService
 from retrieval.retrieval import RetrievalService
 
 
@@ -58,6 +62,7 @@ from retrieval.retrieval import RetrievalService
 _ingestion_service: IngestionService | None = None
 _retrieval_service: RetrievalService | None = None
 _generation_service: GenerationService | None = None
+_agent_service: AgentService | None = None
 
 
 @asynccontextmanager
@@ -71,11 +76,18 @@ async def lifespan(app: FastAPI):
     Loading models here (not at import time) means startup errors are visible
     in the server log, not buried in a traceback from a module-level call.
     """
-    global _ingestion_service, _retrieval_service, _generation_service
+    global _ingestion_service, _retrieval_service, _generation_service, _agent_service
     print("Starting up — loading models and connecting to Qdrant...")
     _ingestion_service = IngestionService()
     _retrieval_service = RetrievalService()
     _generation_service = GenerationService()
+    # AgentService is optional — only initialised when ANTHROPIC_API_KEY is set.
+    # If no key, the /agent/* endpoints return a clear error rather than crashing.
+    if settings.anthropic_api_key:
+        _agent_service = AgentService(_retrieval_service)
+        print("AgentService ready (agentic RAG enabled).")
+    else:
+        print("No ANTHROPIC_API_KEY — /agent/* endpoints disabled.")
     print("All services ready.\n")
     yield
     # Cleanup on shutdown (if needed) goes here
@@ -130,6 +142,14 @@ def get_generation_service() -> GenerationService:
         raise RuntimeError("GenerationService not initialised")
     return _generation_service
 
+
+def get_agent_service() -> AgentService:
+    if _agent_service is None:
+        raise HTTPException(
+            status_code=503,
+            detail="Agentic RAG requires ANTHROPIC_API_KEY — not configured on this server.",
+        )
+    return _agent_service
+
 
 # ── Routes: Ingestion ──────────────────────────────────────────────────────────
 
@@ -319,6 +339,108 @@ async def query_stream(
     return StreamingResponse(token_stream(), media_type="text/event-stream")
 
 
+# ── Routes: Agentic RAG ────────────────────────────────────────────────────────
+#
+# These endpoints wrap AgentService, which runs a ReAct loop:
+#   question → think → search → observe → think → search → ... → answer
+#
+# Why two endpoints?
+#   POST /agent/query  — synchronous. Wait for the full answer, return JSON.
+#                        Simple to integrate, but slow (the whole loop runs first).
+#   GET  /agent/stream — streaming SSE. Watch the agent's thinking in real time.
+#                        Shows each tool call as it happens.
+#
+# The streaming endpoint is the "wow" version — users can see the agent reasoning
+# live: "Searching for backward()... found engine.py... now looking for callers..."
+
+@app.post("/agent/query", response_model=AgentResponse, tags=["agent"])
+async def agent_query(
+    request: AgentRequest,
+    agent_svc: Annotated[AgentService, Depends(get_agent_service)],
+):
+    """
+    Run the agentic RAG loop synchronously.
+
+    The agent searches the codebase multiple times, from different angles,
+    until it has enough evidence to answer confidently. Returns the full
+    reasoning trace (tool_calls) alongside the answer.
+
+    Slower than /query but more thorough — the agent decides what to search,
+    not a fixed single retrieval. Best for complex multi-hop questions like
+    "how does the training loop interact with the optimizer?" that require
+    understanding how multiple pieces connect.
+    """
+    try:
+        result = agent_svc.run(request.question, repo_filter=request.repo)
+        return AgentResponse(
+            answer=result["answer"],
+            tool_calls=[AgentToolCall(**tc) for tc in result["tool_calls"]],
+            iterations=result["iterations"],
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Agent error: {e}")
+
+
+@app.get("/agent/stream", tags=["agent"])
+async def agent_stream(
+    question: Annotated[str, Query(description="Question about the codebase")],
+    agent_svc: Annotated[AgentService, Depends(get_agent_service)],
+    repo: str | None = None,
+):
+    """
+    Run the agentic RAG loop with real-time SSE progress streaming.
+
+    Unlike /query/stream (which just streams tokens), this endpoint lets you
+    watch the agent's full reasoning process as it happens:
+
+        event: tool_call   → agent is about to call a tool (shows name + args)
+        event: tool_result → tool returned, agent is reading the result
+        (default event)    → text token of the final answer
+        event: done        → agent finished (includes iteration count)
+
+    This is the "glass box" view of the agent — users can see exactly what
+    it searched for and what it found, not just the final answer. Critical
+    for trust and debugging in production RAG systems.
+
+    SSE event format for each type:
+        event: tool_call
+        data: {"tool": "search_code", "input": {"query": "backward pass"}}
+
+        event: tool_result
+        data: {"tool": "search_code", "output": "Source 1: engine.py..."}
+
+        (default)
+        data: According to the code...
+
+        event: done
+        data: {"iterations": 3}
+    """
+    import json
+
+    def event_stream():
+        for event in agent_svc.stream(question, repo_filter=repo):
+            etype = event["type"]
+
+            if etype == "tool_call":
+                payload = json.dumps({"tool": event["tool"], "input": event["input"]})
+                yield f"event: tool_call\ndata: {payload}\n\n"
+
+            elif etype == "tool_result":
+                payload = json.dumps({"tool": event["tool"], "output": event["output"]})
+                yield f"event: tool_result\ndata: {payload}\n\n"
+
+            elif etype == "token":
+                safe = event["text"].replace("\n", "\\n")
+                yield f"data: {safe}\n\n"
+
+            elif etype == "done":
+                payload = json.dumps({"iterations": event["iterations"]})
+                yield f"event: done\ndata: {payload}\n\n"
+                yield "data: [DONE]\n\n"
+
+    return StreamingResponse(event_stream(), media_type="text/event-stream")
+
+
 # ── Health check ───────────────────────────────────────────────────────────────
 
 @app.get("/health", tags=["meta"])
backend/models/schemas.py CHANGED
@@ -133,3 +133,56 @@ class ReposResponse(BaseModel):
     """Response from GET /repos — list all indexed repos."""
     repos: list[RepoInfo]
     total_chunks: int
+
+
+# ── Agent (Agentic RAG) ───────────────────────────────────────────────────────
+#
+# These schemas describe the agent's inputs and outputs.
+#
+# The key difference from plain RAG:
+#   Plain RAG:   one retrieval → one answer (deterministic, fast)
+#   Agentic RAG: N retrievals → N observations → one answer (adaptive, slower)
+#
+# The tool_calls list is the "trace" — a record of every search the agent made
+# and what it found. This is what makes the agent explainable: you can see
+# exactly WHY it answered what it did, step by step.
+
+class AgentToolCall(BaseModel):
+    """
+    A single tool call made by the agent during its ReAct loop.
+
+    This is one step in the agent's reasoning trace:
+    - tool:   which tool it called (search_code, get_file_chunk, find_callers)
+    - input:  what arguments it passed (shows WHAT it was looking for)
+    - output: truncated result (shows WHAT it found)
+
+    The sequence of these calls tells the story of the agent's reasoning.
+    """
+    tool: str     # tool name
+    input: dict   # arguments passed to the tool
+    output: str   # first 500 chars of the result (truncated for display)
+
+
+class AgentRequest(BaseModel):
+    """Request body for POST /agent/query — run the agentic RAG loop."""
+    question: str = Field(..., description="Question about the codebase")
+    repo: Optional[str] = Field(
+        default=None,
+        description="Restrict search to a specific repo slug (e.g. 'karpathy/micrograd')",
+    )
+
+
+class AgentResponse(BaseModel):
+    """
+    Response from POST /agent/query.
+
+    In addition to the answer, we return:
+    - tool_calls: the agent's full reasoning trace (what it searched + found)
+    - iterations: how many ReAct steps it took (capped at MAX_ITERATIONS=8)
+
+    This transparency is intentional — it shows users HOW the agent reasoned,
+    not just what it concluded. Makes debugging and trust much easier.
+    """
+    answer: str
+    tool_calls: list[AgentToolCall]
+    iterations: int
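To make the response shape concrete, here is a sketch using plain dataclasses as stand-ins for the pydantic models above (field names match the schemas; the values and the `ToolCall`/`AgentResult` names are made up for illustration):

```python
from dataclasses import dataclass, asdict

@dataclass
class ToolCall:  # stand-in for AgentToolCall
    tool: str
    input: dict
    output: str

@dataclass
class AgentResult:  # stand-in for AgentResponse
    answer: str
    tool_calls: list
    iterations: int

# A plausible /agent/query response for a two-search run
resp = AgentResult(
    answer="backward() runs a topological sort, then applies the chain rule.",
    tool_calls=[
        ToolCall("search_code", {"query": "backward"}, "Source 1: engine.py ..."),
        ToolCall("find_callers", {"function_name": "backward"}, "Source 2: demo.py ..."),
    ],
    iterations=3,
)
payload = asdict(resp)  # the dict shape the endpoint serialises to JSON
```

Reading the `tool_calls` list top to bottom reproduces the agent's reasoning order, which is what the UI trace renders.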
backend/services/agent.py ADDED
@@ -0,0 +1,400 @@
+"""
+agent.py — Agentic RAG using Anthropic tool use.
+
+═══════════════════════════════════════════════════════════════
+WHAT IS AN AGENT? (vs plain RAG)
+═══════════════════════════════════════════════════════════════
+
+Plain RAG (what we had before):
+    Query → single retrieval → LLM → answer
+
+    The problem: one retrieval step may miss critical context.
+    "How does the training loop work?" retrieves train() but misses
+    the DataLoader, the gradient accumulation, the optimizer step.
+    One shot, then done.
+
+Agentic RAG (what we're building):
+    Query → think → search → observe → think → search → observe → answer
+
+    The LLM DECIDES when it has enough information.
+    It can call tools multiple times, from different angles,
+    until it's confident in its answer.
+
+This is called a ReAct loop (Reason + Act):
+    1. REASON:  "I need to find the backward() implementation"
+    2. ACT:     call search_code("backward implementation")
+    3. OBSERVE: "I see relu._backward, but not the main backward()"
+    4. REASON:  "Let me search specifically for Value.backward"
+    5. ACT:     call find_callers("backward")
+    6. OBSERVE: "Found it — it does topological sort first"
+    7. REASON:  "I have enough to answer"
+    8. RESPOND: full answer with citations
+
+═══════════════════════════════════════════════════════════════
+HOW ANTHROPIC TOOL USE WORKS
+═══════════════════════════════════════════════════════════════
+
+Normal message:
+    You → [message] → Claude → [answer text]
+
+With tools:
+    You → [message + tool_definitions] → Claude
+        → either: [answer text]     (done, no tools needed)
+        → or:     [tool_use block]  (Claude wants to call a tool)
+    You run the tool → [tool_result] → Claude
+        → either: [answer text]
+        → or:     [another tool_use block]
+    ... repeat until Claude returns text
+
+The conversation history grows:
+    messages = [
+        {"role": "user", "content": "How does backward() work?"},
+        {"role": "assistant", "content": [{"type": "tool_use", "name": "search_code", ...}]},
+        {"role": "user", "content": [{"type": "tool_result", "tool_use_id": "...", "content": "..."}]},
+        {"role": "assistant", "content": [{"type": "tool_use", "name": "find_callers", ...}]},
+        {"role": "user", "content": [{"type": "tool_result", "tool_use_id": "...", "content": "..."}]},
+        {"role": "assistant", "content": "According to Source 4, backward() works by..."},
+    ]
+
+The key insight: tool results are fed back as "user" messages.
+The model never "runs" the tool — YOU do, and report back.
+
+═══════════════════════════════════════════════════════════════
+STOPPING CONDITIONS
+═══════════════════════════════════════════════════════════════
+
+The loop ends when:
+    1. Claude returns stop_reason="end_turn" (it's satisfied)
+    2. We hit max_iterations (safety cap — prevents infinite loops)
+    3. Claude returns text with no tool calls (it has its answer)
+
+We cap at 8 iterations. Each iteration is one Claude API call + one
+tool execution. This bounds cost and latency while allowing real
+multi-hop reasoning (most questions need 2–4 hops).
+"""
+
+import json
+import sys
+from pathlib import Path
+from typing import Iterator
+
+import requests as http_requests
+
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+from backend.config import settings
+from retrieval.retrieval import RetrievalService
+
+
+# ── Tool definitions (Anthropic format) ───────────────────────────────────────
+# These are the same tools as the MCP server but defined in Anthropic's
+# tool schema format. Same capabilities, different wire format.
+#
+# Notice the pattern: name, description (the LLM reads this!), input_schema.
+# The description tells the LLM WHEN to use the tool. Write it like a
+# docstring for the model's benefit, not yours.
+
+TOOLS = [
+    {
+        "name": "search_code",
+        "description": (
+            "Search the indexed GitHub repositories for code relevant to a query. "
+            "Uses hybrid BM25 + semantic search. Returns ranked code chunks with "
+            "file paths, function names, and line numbers. "
+            "Call this first when answering any question about the codebase. "
+            "You can call it multiple times with different queries to explore different aspects."
+        ),
+        "input_schema": {
+            "type": "object",
+            "properties": {
+                "query": {"type": "string", "description": "What to search for"},
+                "repo": {"type": "string", "description": "Optional: 'owner/repo' to restrict search"},
+                "mode": {
+                    "type": "string",
+                    "enum": ["hybrid", "semantic", "keyword"],
+                    "description": "hybrid=default, keyword=exact identifiers, semantic=concepts",
+                },
+                "top_k": {"type": "integer", "description": "Number of results (default 5)"},
+            },
+            "required": ["query"],
+        },
+    },
+    {
+        "name": "get_file_chunk",
+        "description": (
+            "Fetch the raw content of a specific section of a file from GitHub. "
+            "Use this when a search result shows a function but you need more context: "
+            "the lines above (docstring, decorators) or below (what comes after). "
+            "Also useful to see the full class when search only returned one method."
+        ),
+        "input_schema": {
+            "type": "object",
+            "properties": {
+                "repo": {"type": "string", "description": "'owner/repo'"},
+                "filepath": {"type": "string", "description": "path within the repo"},
+                "start_line": {"type": "integer"},
+                "end_line": {"type": "integer"},
+            },
+            "required": ["repo", "filepath", "start_line", "end_line"],
+        },
+    },
+    {
+        "name": "find_callers",
+        "description": (
+            "Find all places in the codebase that call a specific function or class. "
+            "Essential for understanding HOW something is used, not just what it does. "
+            "Example: after finding the definition of Value.__mul__, call find_callers "
+            "to see where multiplication is actually performed in training code."
+        ),
+        "input_schema": {
+            "type": "object",
+            "properties": {
+                "function_name": {"type": "string"},
+                "repo": {"type": "string", "description": "Optional: restrict to one repo"},
+            },
+            "required": ["function_name"],
+        },
+    },
+]
+
+
+class AgentService:
+    """
+    Runs a ReAct (Reason + Act) loop using Anthropic tool use.
+
+    The agent has access to three tools: search_code, get_file_chunk, find_callers.
+    It runs until either it produces an answer or hits MAX_ITERATIONS.
+
+    Each call to `run()` returns a structured result including:
+    - The final answer
+    - The tool call trace (what it searched, what it found)
+    - The number of reasoning iterations it took
+    """
+
+    MAX_ITERATIONS = 8
+
+    SYSTEM_PROMPT = """You are an expert code assistant with access to a searchable index of GitHub repositories.
+
+When answering questions about code:
+1. Start by calling search_code to find relevant code
+2. If the initial results don't fully answer the question, search again with a different query
+3. Use get_file_chunk to see more context around a result (e.g., the full class or surrounding code)
+4. Use find_callers to understand how functions are used, not just defined
+5. Only answer when you have enough evidence from the actual code
+
+Always cite your sources: mention the file path and line numbers.
+Be precise — if the code doesn't show what you're looking for, say so rather than guessing."""
+
+    def __init__(self, retrieval_service: RetrievalService):
+        self.retrieval = retrieval_service
+        if not settings.anthropic_api_key:
+            raise ValueError("ANTHROPIC_API_KEY required for agentic queries")
+        import anthropic
+        self._client = anthropic.Anthropic(api_key=settings.anthropic_api_key)
+
+    def run(self, question: str, repo_filter: str | None = None) -> dict:
+        """
+        Run the agent loop synchronously.
+
+        Returns:
+            {
+                "answer": str,            # final LLM answer
+                "tool_calls": list[dict], # trace: [{tool, input, output}, ...]
+                "iterations": int,        # how many reasoning steps it took
+            }
+        """
+        # The conversation starts with just the user question.
+        # Tool results will be appended as the loop progresses.
+        messages = [{"role": "user", "content": question}]
+
+        # If the user selected a specific repo, hint the agent
+        if repo_filter:
+            messages[0]["content"] += f"\n\n(Search in repo: {repo_filter})"
+
+        tool_trace = []
+
+        for iteration in range(self.MAX_ITERATIONS):
+            # ── Ask Claude (with tools available) ─────────────────────────────
+            response = self._client.messages.create(
+                model="claude-haiku-4-5-20251001",
+                max_tokens=2048,
+                system=self.SYSTEM_PROMPT,
+                tools=TOOLS,
+                messages=messages,
+            )
+
+            # ── Did Claude give us a final answer? ────────────────────────────
+            # stop_reason="end_turn" means Claude is done — no more tool calls.
+            if response.stop_reason == "end_turn":
+                answer = ""
+                for block in response.content:
+                    if hasattr(block, "text"):
+                        answer += block.text
+                return {
+                    "answer": answer,
+                    "tool_calls": tool_trace,
+                    "iterations": iteration + 1,
+                }
+
+            # ── Claude wants to call tools ────────────────────────────────────
+            # The response content may have multiple blocks:
+            #   - TextContent blocks (thinking out loud)
+            #   - ToolUseContent blocks (actual tool calls)
+
+            # Append Claude's response to the conversation history
+            messages.append({"role": "assistant", "content": response.content})
+
+            # Process each tool call
+            tool_results = []
+            for block in response.content:
+                if block.type != "tool_use":
+                    continue
+
+                tool_name = block.name
+                tool_input = block.input
+                tool_use_id = block.id
+
+                # ── Execute the tool ──────────────────────────────────────────
+                try:
+                    result = self._execute_tool(tool_name, tool_input)
+                except Exception as e:
+                    result = f"Tool error: {e}"
+
+                # Record for the trace
+                tool_trace.append({
+                    "tool": tool_name,
+                    "input": tool_input,
+                    "output": result[:500] + "..." if len(result) > 500 else result,
+                })
+
+                # ── Build the tool_result message ─────────────────────────────
+                # This goes back to Claude as a "user" turn.
+                # Claude reads these results and decides what to do next.
+                tool_results.append({
+                    "type": "tool_result",
+                    "tool_use_id": tool_use_id,
+                    "content": result,
+                })
+
+            # Add all tool results to the conversation
+            messages.append({"role": "user", "content": tool_results})
+
+        # Hit max iterations — return what we have
+        return {
+            "answer": "I was unable to fully answer this question within the allowed reasoning steps.",
+            "tool_calls": tool_trace,
+            "iterations": self.MAX_ITERATIONS,
+        }
+
+    def stream(self, question: str, repo_filter: str | None = None) -> Iterator[dict]:
+        """
+        Stream agent progress as it happens.
+
+        Yields dicts with type:
+            {"type": "tool_call",   "tool": "search_code", "input": {...}}
+            {"type": "tool_result", "tool": "search_code", "output": "..."}
+            {"type": "token",       "text": "According..."}
+            {"type": "done",        "iterations": 3}
+        """
+        messages = [{"role": "user", "content": question}]
+        if repo_filter:
+            messages[0]["content"] += f"\n\n(Search in repo: {repo_filter})"
+
+        for iteration in range(self.MAX_ITERATIONS):
+            response = self._client.messages.create(
+                model="claude-haiku-4-5-20251001",
+                max_tokens=2048,
+                system=self.SYSTEM_PROMPT,
+                tools=TOOLS,
+                messages=messages,
+            )
+
+            if response.stop_reason == "end_turn":
+                # Stream the final answer token by token
+                for block in response.content:
+                    if hasattr(block, "text"):
+                        # Yield word-by-word for a streaming feel
+                        for word in block.text.split(" "):
+                            yield {"type": "token", "text": word + " "}
+                yield {"type": "done", "iterations": iteration + 1}
+                return
+
+            messages.append({"role": "assistant", "content": response.content})
+
+            tool_results = []
+            for block in response.content:
+                if block.type != "tool_use":
+                    continue
+
+                yield {"type": "tool_call", "tool": block.name, "input": block.input}
+
+                try:
+                    result = self._execute_tool(block.name, block.input)
+                except Exception as e:
+                    result = f"Tool error: {e}"
+
+                yield {"type": "tool_result", "tool": block.name, "output": result[:300]}
+
+                tool_results.append({
+                    "type": "tool_result",
+                    "tool_use_id": block.id,
+                    "content": result,
+                })
+
+            messages.append({"role": "user", "content": tool_results})
+
+        yield {"type": "done", "iterations": self.MAX_ITERATIONS}
+
+    # ── Tool execution ─────────────────────────────────────────────────────────
+
+    def _execute_tool(self, name: str, args: dict) -> str:
+        if name == "search_code":
+            return self._tool_search_code(args)
+        elif name == "get_file_chunk":
+            return self._tool_get_file_chunk(args)
+        elif name == "find_callers":
+            return self._tool_find_callers(args)
+        return f"Unknown tool: {name}"
+
+    def _tool_search_code(self, args: dict) -> str:
+        results = self.retrieval.search(
+            query=args["query"],
+            top_k=args.get("top_k", 5),
+            repo_filter=args.get("repo"),
+            mode=args.get("mode", "hybrid"),
+        )
+        if not results:
+            return "No results found."
+        return self.retrieval.format_context(results)
+
+    def _tool_get_file_chunk(self, args: dict) -> str:
+        repo = args["repo"]
+        filepath = args["filepath"]
+        start = args["start_line"]
+        end = args["end_line"]
+        owner, name = repo.split("/", 1)
+        url = f"https://api.github.com/repos/{owner}/{name}/contents/{filepath}"
+        headers = {"Accept": "application/vnd.github.v3.raw"}
+        if settings.github_token:
+            headers["Authorization"] = f"token {settings.github_token}"
+        resp = http_requests.get(url, headers=headers, timeout=15)
+        if resp.status_code == 404:
+            return f"File not found: {filepath}"
+        resp.raise_for_status()
+        lines = resp.text.splitlines()
+        start = max(1, start)
+        end = min(len(lines), end)
+        chunk = "\n".join(f"{i + start}: {line}" for i, line in enumerate(lines[start - 1:end]))
+        return f"# {repo} — {filepath} (lines {start}–{end})\n\n{chunk}"
+
+    def _tool_find_callers(self, args: dict) -> str:
+        name = args["function_name"]
+        results = self.retrieval.search(
+            query=name,
+            top_k=8,
+            repo_filter=args.get("repo"),
+            mode="keyword",
+        )
+        callers = [r for r in results if name in r["text"]]
+        if not callers:
+            return f"No call sites found for '{name}'."
+        return self.retrieval.format_context(callers)
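The message flow in `run()` can be exercised offline. The sketch below is a self-contained walk-through of the same ReAct loop with a scripted fake client instead of the real Anthropic SDK (`ScriptedClient`, `react_loop`, and all values are illustrative, not project code):

```python
from types import SimpleNamespace

class ScriptedClient:
    """Fake client: first call returns one tool_use block,
    second call returns the final text with stop_reason='end_turn'."""
    def __init__(self):
        self.calls = 0

    def create(self, messages):
        self.calls += 1
        if self.calls == 1:
            return SimpleNamespace(stop_reason="tool_use", content=[
                SimpleNamespace(type="tool_use", id="t1",
                                name="search_code", input={"query": "backward"}),
            ])
        return SimpleNamespace(stop_reason="end_turn", content=[
            SimpleNamespace(type="text",
                            text="backward() sorts the graph, then applies the chain rule."),
        ])

def react_loop(client, question, execute_tool, max_iterations=8):
    # Same flow as AgentService.run(): assistant tool_use blocks go into the
    # history, tool results come back as a "user" turn, loop until end_turn.
    messages = [{"role": "user", "content": question}]
    trace = []
    for i in range(max_iterations):
        resp = client.create(messages=messages)
        if resp.stop_reason == "end_turn":
            answer = "".join(b.text for b in resp.content if hasattr(b, "text"))
            return {"answer": answer, "tool_calls": trace, "iterations": i + 1}
        messages.append({"role": "assistant", "content": resp.content})
        results = []
        for b in resp.content:
            if b.type != "tool_use":
                continue
            out = execute_tool(b.name, b.input)  # WE run the tool, not the model
            trace.append({"tool": b.name, "input": b.input, "output": out})
            results.append({"type": "tool_result", "tool_use_id": b.id, "content": out})
        messages.append({"role": "user", "content": results})
    return {"answer": "(max iterations reached)", "tool_calls": trace,
            "iterations": max_iterations}

result = react_loop(ScriptedClient(), "How does backward() work?",
                    lambda name, args: "Source 1: engine.py ...")
```

Swapping `ScriptedClient` for the real client is the only change needed to go live, which is also a convenient way to unit-test the loop's stopping conditions.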
mcp_server/__init__.py ADDED
File without changes
mcp_server/server.py ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ """
+ mcp_server/server.py — Our GitHub RAG Copilot as an MCP server.
+
+ ═══════════════════════════════════════════════════════════════
+ WHAT IS MCP?
+ ═══════════════════════════════════════════════════════════════
+
+ MCP (Model Context Protocol) is an open standard created by Anthropic that
+ defines HOW AI models connect to external tools and data sources.
+
+ Think of it like USB-C for AI:
+   Before USB-C, every device had a different charging port.
+   Before MCP, every AI application built its own custom tool integration.
+
+ With MCP:
+   - You build a server ONCE exposing your capabilities
+   - ANY MCP client (Claude Desktop, Cursor, your custom app) can use it
+   - The AI model gets a consistent interface regardless of the tool
+
+ Without MCP (what we had before):
+   our_app → hardcoded API calls → specific tools
+
+ With MCP:
+   our_app        ←→ MCP protocol ←→ ANY tools
+   Claude Desktop ←→ MCP protocol ←→ our RAG server
+   Cursor         ←→ MCP protocol ←→ our RAG server
+
+ ═══════════════════════════════════════════════════════════════
+ MCP'S THREE PRIMITIVES
+ ═══════════════════════════════════════════════════════════════
+
+ MCP defines exactly three things a server can expose:
+
+ 1. TOOLS — functions the LLM can call
+    "search for code", "read a file", "run a query"
+    → LLM decides when to call them (autonomous)
+
+ 2. RESOURCES — data the LLM can read (like files or DB records)
+    "here is the list of indexed repos"
+    → Client controls when to read them (not LLM)
+
+ 3. PROMPTS — reusable prompt templates with arguments
+    "explain this function: {code}"
+    → User triggers these (shown as slash commands in Claude Desktop)
+
+ Each primitive has a different actor:
+   Tools     → LLM-driven (model decides to call them mid-reasoning)
+   Resources → Client-driven (app fetches them at context-building time)
+   Prompts   → User-driven (user picks them from a menu)
+
+ ═══════════════════════════════════════════════════════════════
+ TWO TRANSPORT MODES
+ ═══════════════════════════════════════════════════════════════
+
+ MCP servers communicate over one of two transports:
+
+ STDIO (standard input/output):
+   - Claude Desktop spawns your server as a subprocess
+   - Communication happens over stdin/stdout pipes
+   - Simpler, no network configuration needed
+   - Best for: local tools, Claude Desktop integration
+
+ HTTP + SSE (Server-Sent Events):
+   - Your server runs as a web service
+   - Clients connect over the network
+   - Supports multiple concurrent clients
+   - Best for: deployed services, shared team tools
+
+ This server supports BOTH — stdio for local dev, HTTP for production.
+
+ ═══════════════════════════════════════════════════════════════
+ TOOLS WE EXPOSE
+ ═══════════════════════════════════════════════════════════════
+
+ search_code(query, repo?, language?, mode?, top_k?)
+   → Hybrid BM25 + semantic search over indexed repos
+   → Returns ranked code chunks with filepath + line numbers
+
+ list_repos()
+   → Returns all repos currently in the index
+
+ get_file_chunk(repo, filepath, start_line, end_line)
+   → Fetches a specific range of lines from GitHub
+   → Used for follow-up: "show me more of that function"
+
+ find_callers(function_name, repo)
+   → Searches for all call sites of a function
+   → Enables "who calls this?" multi-hop reasoning
+ """
+
+ from pathlib import Path
+ import sys
+ import json
+
+ sys.path.insert(0, str(Path(__file__).parent.parent))
+
+ from mcp.server import Server
+ from mcp.server.stdio import stdio_server
+ from mcp import types
+
+ from retrieval.retrieval import RetrievalService
+ from ingestion.qdrant_store import QdrantStore
+ from ingestion.repo_fetcher import fetch_repo_files, parse_github_url
+ from backend.config import settings
+
+ # ── Server init ───────────────────────────────────────────────────────────────
+ # The Server object is the MCP server. It handles the protocol:
+ #   - responding to tool/resource/prompt list requests
+ #   - dispatching tool calls to our handlers
+ #   - serialising results back in MCP format
+
+ app = Server("github-rag-copilot")
+
+ # Services loaded once — same pattern as FastAPI lifespan
+ _retrieval: RetrievalService | None = None
+ _store: QdrantStore | None = None
+
+
+ def get_retrieval() -> RetrievalService:
+     global _retrieval
+     if _retrieval is None:
+         _retrieval = RetrievalService()
+     return _retrieval
+
+
+ def get_store() -> QdrantStore:
+     global _store
+     if _store is None:
+         _store = QdrantStore()
+     return _store
+
+ # ══════════════════════════════════════════════════════════════════════════════
+ # TOOLS
+ # Every tool needs:
+ #   1. A name (what the LLM calls)
+ #   2. A description (what the LLM reads to decide whether to call it)
+ #   3. An inputSchema (JSON Schema — the LLM fills this in)
+ #   4. A handler (@app.call_tool)
+ #
+ # The description is CRITICAL — it's the only thing the LLM reads when
+ # deciding which tool to use. Write it like documentation for a smart
+ # person who can't see your code.
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ @app.list_tools()
+ async def list_tools() -> list[types.Tool]:
+     """Called by MCP clients to discover what this server can do."""
+     return [
+         types.Tool(
+             name="search_code",
+             description=(
+                 "Search for code chunks relevant to a query using hybrid BM25 + semantic search. "
+                 "Returns ranked code snippets with file paths and line numbers. "
+                 "Use this to find function definitions, class implementations, usage examples, "
+                 "or any code related to a concept. "
+                 "Specify repo to restrict search to a single repository."
+             ),
+             inputSchema={
+                 "type": "object",
+                 "properties": {
+                     "query": {
+                         "type": "string",
+                         "description": "Natural language question or code identifier to search for",
+                     },
+                     "repo": {
+                         "type": "string",
+                         "description": "Optional: restrict to a repo slug like 'karpathy/micrograd'",
+                     },
+                     "language": {
+                         "type": "string",
+                         "description": "Optional: filter by language like 'python', 'typescript'",
+                     },
+                     "mode": {
+                         "type": "string",
+                         "enum": ["hybrid", "semantic", "keyword"],
+                         "description": "Search strategy. hybrid (default) combines semantic + BM25.",
+                     },
+                     "top_k": {
+                         "type": "integer",
+                         "description": "Number of results to return (default 5)",
+                     },
+                 },
+                 "required": ["query"],
+             },
+         ),
+         types.Tool(
+             name="list_repos",
+             description=(
+                 "List all GitHub repositories currently indexed and available for search. "
+                 "Returns repo slugs (owner/name) and chunk counts. "
+                 "Call this first to know which repos are available before searching."
+             ),
+             inputSchema={
+                 "type": "object",
+                 "properties": {},
+                 "required": [],
+             },
+         ),
+         types.Tool(
+             name="get_file_chunk",
+             description=(
+                 "Fetch the raw content of a specific file section from GitHub. "
+                 "Use this to see more context around a search result — for example, "
+                 "if search returns lines 45–52 but you need the full function including "
+                 "its docstring at lines 38–44. "
+                 "Requires the repo to be publicly accessible on GitHub."
+             ),
+             inputSchema={
+                 "type": "object",
+                 "properties": {
+                     "repo": {
+                         "type": "string",
+                         "description": "Repository slug like 'karpathy/micrograd'",
+                     },
+                     "filepath": {
+                         "type": "string",
+                         "description": "File path within the repo like 'micrograd/engine.py'",
+                     },
+                     "start_line": {
+                         "type": "integer",
+                         "description": "First line to fetch (1-indexed)",
+                     },
+                     "end_line": {
+                         "type": "integer",
+                         "description": "Last line to fetch (inclusive)",
+                     },
+                 },
+                 "required": ["repo", "filepath", "start_line", "end_line"],
+             },
+         ),
+         types.Tool(
+             name="find_callers",
+             description=(
+                 "Find all places in the indexed code that call a specific function or class. "
+                 "Use this for multi-hop reasoning: after finding a function definition, "
+                 "call this to understand how it's used and in what context. "
+                 "Returns code chunks containing calls to the specified name."
+             ),
+             inputSchema={
+                 "type": "object",
+                 "properties": {
+                     "function_name": {
+                         "type": "string",
+                         "description": "Function or class name to search for call sites",
+                     },
+                     "repo": {
+                         "type": "string",
+                         "description": "Optional: restrict to a specific repository",
+                     },
+                 },
+                 "required": ["function_name"],
+             },
+         ),
+     ]
+
+
+ # ══════════════════════════════════════════════════════════════════════════════
+ # TOOL HANDLERS
+ # @app.call_tool() receives the tool name + arguments from the LLM.
+ # Returns a list of content blocks (text, image, or resource).
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ @app.call_tool()
+ async def call_tool(name: str, arguments: dict) -> list[types.TextContent]:
+     """Route tool calls to the appropriate handler."""
+     if name == "search_code":
+         return await _handle_search_code(arguments)
+     elif name == "list_repos":
+         return await _handle_list_repos(arguments)
+     elif name == "get_file_chunk":
+         return await _handle_get_file_chunk(arguments)
+     elif name == "find_callers":
+         return await _handle_find_callers(arguments)
+     else:
+         return [types.TextContent(type="text", text=f"Unknown tool: {name}")]
+
+
+ async def _handle_search_code(args: dict) -> list[types.TextContent]:
+     retrieval = get_retrieval()
+     results = retrieval.search(
+         query=args["query"],
+         top_k=args.get("top_k", 5),
+         repo_filter=args.get("repo"),
+         language_filter=args.get("language"),
+         mode=args.get("mode", "hybrid"),
+     )
+     if not results:
+         return [types.TextContent(type="text", text="No results found.")]
+     return [types.TextContent(
+         type="text",
+         text=retrieval.format_context(results),
+     )]
+
+
+ async def _handle_list_repos(args: dict) -> list[types.TextContent]:
+     store = get_store()
+     repos = store.list_repos()
+     if not repos:
+         return [types.TextContent(type="text", text="No repositories indexed yet.")]
+     lines = [f"- {slug} ({store.count(repo=slug)} chunks)" for slug in repos]
+     return [types.TextContent(type="text", text="Indexed repositories:\n" + "\n".join(lines))]
+
+
+ async def _handle_get_file_chunk(args: dict) -> list[types.TextContent]:
+     """Fetch a file from GitHub and return the requested line range."""
+     import requests
+     repo = args["repo"]
+     filepath = args["filepath"]
+     start = args["start_line"]
+     end = args["end_line"]
+
+     owner, name = repo.split("/", 1)
+     url = f"https://api.github.com/repos/{owner}/{name}/contents/{filepath}"
+     headers = {"Accept": "application/vnd.github.v3.raw"}
+     if settings.github_token:
+         headers["Authorization"] = f"token {settings.github_token}"
+
+     response = requests.get(url, headers=headers, timeout=15)
+     if response.status_code == 404:
+         return [types.TextContent(type="text", text=f"File not found: {filepath}")]
+     response.raise_for_status()
+
+     lines = response.text.splitlines()
+     # Clamp to actual file length
+     start = max(1, start)
+     end = min(len(lines), end)
+     chunk = "\n".join(f"{i+start}: {line}" for i, line in enumerate(lines[start-1:end]))
+     return [types.TextContent(
+         type="text",
+         text=f"# {repo} — {filepath} (lines {start}–{end})\n\n{chunk}",
+     )]
+
+
+ async def _handle_find_callers(args: dict) -> list[types.TextContent]:
+     """Find call sites by keyword-searching for the function name."""
+     retrieval = get_retrieval()
+     # Keyword mode is best for exact identifier matching
+     results = retrieval.search(
+         query=args["function_name"],
+         top_k=8,
+         repo_filter=args.get("repo"),
+         mode="keyword",
+     )
+     # Filter to chunks that actually contain the name (keyword search may return
+     # chunks that share tokens with the name)
+     name = args["function_name"]
+     callers = [r for r in results if name in r["text"]]
+     if not callers:
+         return [types.TextContent(type="text", text=f"No call sites found for '{name}'.")]
+     return [types.TextContent(
+         type="text",
+         text=retrieval.format_context(callers),
+     )]
+
+
+ # ══════════════════════════════════════════════════════════════════════════════
+ # RESOURCES
+ # Resources are read-only data the LLM (or client) can browse.
+ # Unlike tools (LLM calls them to act), resources are like open tabs —
+ # the client can read them to build up context.
+ #
+ # We expose each indexed repo as a resource with a custom URI scheme:
+ #   rag://repos/karpathy/micrograd
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ @app.list_resources()
+ async def list_resources() -> list[types.Resource]:
+     """Expose indexed repos as browsable resources."""
+     store = get_store()
+     repos = store.list_repos()
+     return [
+         types.Resource(
+             uri=f"rag://repos/{slug}",
+             name=slug,
+             description=f"Indexed code from {slug} ({store.count(repo=slug)} chunks)",
+             mimeType="text/plain",
+         )
+         for slug in repos
+     ]
+
+
+ @app.read_resource()
+ async def read_resource(uri: str) -> str:
+     """Return a summary for a repo resource."""
+     # Parse rag://repos/owner/name (the SDK may pass a URL object, so coerce to str)
+     slug = str(uri).removeprefix("rag://repos/")
+     store = get_store()
+     count = store.count(repo=slug)
+     return f"Repository: {slug}\nIndexed chunks: {count}\n\nUse the search_code tool to query this repo."
+
+
+ # ══════════════════════════════════════════════════════════════════════════════
+ # PROMPTS
+ # Prompts are reusable templates shown to the user as slash commands in
+ # Claude Desktop. The user picks a prompt, fills in arguments, and Claude
+ # executes it with the template expanded.
+ #
+ # This is different from tools (which the LLM calls) and resources (which
+ # the client reads). Prompts are USER-INITIATED templates.
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ @app.list_prompts()
+ async def list_prompts() -> list[types.Prompt]:
+     return [
+         types.Prompt(
+             name="explain-function",
+             description="Retrieve and explain a specific function from the indexed repos",
+             arguments=[
+                 types.PromptArgument(
+                     name="function_name",
+                     description="Name of the function to explain",
+                     required=True,
+                 ),
+                 types.PromptArgument(
+                     name="repo",
+                     description="Repository slug (optional, e.g. karpathy/micrograd)",
+                     required=False,
+                 ),
+             ],
+         ),
+         types.Prompt(
+             name="repo-overview",
+             description="Generate an architectural overview of an indexed repository",
+             arguments=[
+                 types.PromptArgument(
+                     name="repo",
+                     description="Repository slug like 'karpathy/micrograd'",
+                     required=True,
+                 ),
+             ],
+         ),
+     ]
+
+
+ @app.get_prompt()
+ async def get_prompt(name: str, arguments: dict) -> types.GetPromptResult:
+     if name == "explain-function":
+         fn = arguments.get("function_name", "")
+         repo = arguments.get("repo", "")
+         repo_clause = f" in {repo}" if repo else ""
+         return types.GetPromptResult(
+             description=f"Explain {fn}",
+             messages=[
+                 types.PromptMessage(
+                     role="user",
+                     content=types.TextContent(
+                         type="text",
+                         text=(
+                             f"Use the search_code tool to find the implementation of `{fn}`{repo_clause}. "
+                             f"Then explain what it does, its parameters, return value, and any important "
+                             f"implementation details. Cite the source file and line numbers."
+                         ),
+                     ),
+                 )
+             ],
+         )
+     elif name == "repo-overview":
+         repo = arguments.get("repo", "")
+         return types.GetPromptResult(
+             description=f"Overview of {repo}",
+             messages=[
+                 types.PromptMessage(
+                     role="user",
+                     content=types.TextContent(
+                         type="text",
+                         text=(
+                             f"Use search_code with repo='{repo}' to explore the codebase. "
+                             f"Search for: main entry points, core data structures, key abstractions. "
+                             f"Then write a structured architectural overview covering: "
+                             f"1) What the project does, 2) Main modules and their responsibilities, "
+                             f"3) Key data flow, 4) Important design patterns used."
+                         ),
+                     ),
+                 )
+             ],
+         )
+     raise ValueError(f"Unknown prompt: {name}")
+
+
+ # ══════════════════════════════════════════════════════════════════════════════
+ # ENTRY POINT
+ # Run as stdio server (for Claude Desktop) or imported for HTTP mode.
+ # ══════════════════════════════════════════════════════════════════════════════
+
+ async def main():
+     """Run the MCP server over stdio (for Claude Desktop integration)."""
+     # Log to stderr — stdout is the MCP protocol channel in stdio mode,
+     # and stray prints there would corrupt the JSON-RPC stream.
+     print("Starting GitHub RAG MCP server...", file=sys.stderr, flush=True)
+     async with stdio_server() as (read_stream, write_stream):
+         await app.run(
+             read_stream,
+             write_stream,
+             app.create_initialization_options(),
+         )
+
+
+ if __name__ == "__main__":
+     import asyncio
+     asyncio.run(main())
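The line-range handling in `_handle_get_file_chunk` can be sketched standalone. This is a minimal, hypothetical `slice_numbered` helper (not part of the server) showing the 1-indexed clamping and numbering the handler applies before returning a chunk:

```python
# Hypothetical helper mirroring _handle_get_file_chunk's clamping logic.
def slice_numbered(text: str, start: int, end: int) -> str:
    lines = text.splitlines()
    start = max(1, start)       # clamp below: line numbers are 1-indexed
    end = min(len(lines), end)  # clamp above: never run past end-of-file
    return "\n".join(f"{i + start}: {line}" for i, line in enumerate(lines[start - 1:end]))


sample = "a\nb\nc\nd"
print(slice_numbered(sample, 2, 3))   # an in-range request
print(slice_numbered(sample, 0, 99))  # out-of-range bounds are clamped to the file
```

Clamping rather than erroring is a deliberate choice here: the LLM often guesses line ranges, and a best-effort slice keeps the tool-use loop moving.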
requirements.txt CHANGED
@@ -24,3 +24,4 @@ anthropic
  # Utilities
  python-dotenv
  pydantic
+ mcp
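With the `mcp` package installed, the stdio server can be registered in Claude Desktop's config file under the standard `mcpServers` key. A sketch, assuming the repo lives at `/path/to/project` (the server name and path are placeholders to adjust for your setup):

```json
{
  "mcpServers": {
    "github-rag-copilot": {
      "command": "python",
      "args": ["/path/to/project/mcp_server/server.py"]
    }
  }
}
```

Claude Desktop spawns the command as a subprocess and speaks MCP over its stdin/stdout, so no port or URL is needed for the stdio transport.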
ui/src/App.jsx CHANGED
@@ -1,12 +1,13 @@
  import { useState, useEffect, useRef, useCallback } from "react";
  import Sidebar from "./components/Sidebar";
  import Message from "./components/Message";
- import { fetchRepos, streamQuery } from "./api";
+ import { fetchRepos, streamQuery, streamAgentQuery } from "./api";

  export default function App() {
    const [repos, setRepos] = useState([]);
    const [activeRepo, setActiveRepo] = useState(null);
    const [mode, setMode] = useState("hybrid");
+   const [agentMode, setAgentMode] = useState(false);
    const [messages, setMessages] = useState([]);
    const [input, setInput] = useState("");
    const [streaming, setStreaming] = useState(false);
@@ -54,55 +55,109 @@ export default function App() {
    if (!question || streaming) return;
    setInput("");

-   // Add user message
+   // Add user message + placeholder assistant message
    const userMsg = { role: "user", content: question };
-   // Add placeholder assistant message
    const assistantId = Date.now();
    const assistantMsg = {
      id: assistantId, role: "assistant",
      content: "", sources: [], queryType: null, streaming: true,
+     // Agent-mode extras:
+     toolCalls: [], currentTool: null, iterations: null,
    };
    setMessages((prev) => [...prev, userMsg, assistantMsg]);
    setStreaming(true);

-   const stop = streamQuery({
-     question,
-     repo: activeRepo,
-     mode,
-     onToken: (token) => {
-       setMessages((prev) =>
-         prev.map((m) => m.id === assistantId
-           ? { ...m, content: m.content + token }
-           : m
-         )
-       );
-     },
-     onSources: (sources, queryType) => {
-       setMessages((prev) =>
-         prev.map((m) => m.id === assistantId
-           ? { ...m, sources, queryType }
-           : m
-         )
-       );
-     },
-     onDone: () => {
-       setMessages((prev) =>
-         prev.map((m) => m.id === assistantId ? { ...m, streaming: false } : m)
-       );
-       setStreaming(false);
-       stopStream.current = null;
-     },
-     onError: (err) => {
-       setMessages((prev) =>
-         prev.map((m) => m.id === assistantId
-           ? { ...m, content: `Error: ${err}`, streaming: false }
-           : m
-         )
-       );
-       setStreaming(false);
-       stopStream.current = null;
-     },
-   });
+   // ── Common callbacks ──────────────────────────────────────────────────────
+   const onToken = (token) =>
+     setMessages((prev) =>
+       prev.map((m) => m.id === assistantId ? { ...m, content: m.content + token } : m)
+     );
+
+   const onError = (err) => {
+     setMessages((prev) =>
+       prev.map((m) => m.id === assistantId
+         ? { ...m, content: `Error: ${err}`, streaming: false }
+         : m
+       )
+     );
+     setStreaming(false);
+     stopStream.current = null;
+   };
+
+   let stop;
+
+   if (agentMode) {
+     // ── Agent mode: ReAct loop with live tool-call trace ──────────────────
+     stop = streamAgentQuery({
+       question,
+       repo: activeRepo,
+       onToolCall: (tool, input) => {
+         // Show spinner with tool name while agent is calling
+         setMessages((prev) =>
+           prev.map((m) => m.id === assistantId
+             ? { ...m, currentTool: tool }
+             : m
+           )
+         );
+         // Append to the tool call trace (output will be filled by onToolResult)
+         setMessages((prev) =>
+           prev.map((m) => m.id === assistantId
+             ? { ...m, toolCalls: [...m.toolCalls, { tool, input, output: "" }] }
+             : m
+           )
+         );
+       },
+       onToolResult: (tool, output) => {
+         // Fill in the output of the last tool call in the trace
+         setMessages((prev) =>
+           prev.map((m) => {
+             if (m.id !== assistantId) return m;
+             const calls = [...m.toolCalls];
+             // Find last call for this tool (most recent) and fill its output
+             for (let i = calls.length - 1; i >= 0; i--) {
+               if (calls[i].tool === tool && !calls[i].output) {
+                 calls[i] = { ...calls[i], output };
+                 break;
+               }
+             }
+             return { ...m, toolCalls: calls, currentTool: "thinking" };
+           })
+         );
+       },
+       onToken,
+       onDone: (iterations) => {
+         setMessages((prev) =>
+           prev.map((m) => m.id === assistantId
+             ? { ...m, streaming: false, currentTool: null, iterations }
+             : m
+           )
+         );
+         setStreaming(false);
+         stopStream.current = null;
+       },
+       onError,
+     });
+   } else {
+     // ── Plain RAG mode: single retrieval → stream tokens ──────────────────
+     stop = streamQuery({
+       question,
+       repo: activeRepo,
+       mode,
+       onToken,
+       onSources: (sources, queryType) =>
+         setMessages((prev) =>
+           prev.map((m) => m.id === assistantId ? { ...m, sources, queryType } : m)
+         ),
+       onDone: () => {
+         setMessages((prev) =>
+           prev.map((m) => m.id === assistantId ? { ...m, streaming: false } : m)
+         );
+         setStreaming(false);
+         stopStream.current = null;
+       },
+       onError,
+     });
+   }

    stopStream.current = stop;
  }
@@ -133,6 +188,8 @@ export default function App() {
      onReposChange={loadRepos}
      mode={mode}
      onModeChange={setMode}
+     agentMode={agentMode}
+     onAgentModeChange={setAgentMode}
    />

    <div className="main">
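The onToolResult handler above fills the most recent trace entry for a tool that has no output yet. The same bookkeeping rule, sketched as a hypothetical plain-Python mirror of the JS logic (useful for checking the rule in isolation):

```python
# Hypothetical Python mirror of the onToolResult trace update:
# fill the output of the most recent call to `tool` whose output is still empty.
def fill_last_output(calls: list[dict], tool: str, output: str) -> list[dict]:
    calls = [dict(c) for c in calls]          # shallow copies, like the JS spread
    for i in range(len(calls) - 1, -1, -1):   # scan from most recent backwards
        if calls[i]["tool"] == tool and not calls[i]["output"]:
            calls[i]["output"] = output
            break
    return calls


trace = [
    {"tool": "search_code", "input": {"query": "loss"}, "output": "chunk A"},
    {"tool": "search_code", "input": {"query": "backward"}, "output": ""},
]
trace = fill_last_output(trace, "search_code", "chunk B")
```

Matching by "last call without output" rather than plain "last call" keeps the trace correct even if events for different tools interleave.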
ui/src/api.js CHANGED
@@ -70,3 +70,66 @@ export function streamQuery({ question, repo, mode, onToken, onSources, onDone,

    return () => es.close();
  }
+
+ /**
+  * Stream the agentic RAG loop via SSE.
+  *
+  * Unlike streamQuery (one retrieval → tokens), this endpoint shows the
+  * agent's full ReAct reasoning loop in real time:
+  *
+  *   1. agent decides to search      → event: tool_call
+  *   2. result comes back            → event: tool_result
+  *   3. agent decides to search again (or answer)
+  *   4. when done, answer streams token-by-token (default events)
+  *   5. event: done signals completion with iteration count
+  *
+  * Callbacks:
+  *   onToolCall(tool, input)    — agent is calling a tool
+  *   onToolResult(tool, output) — tool returned a result
+  *   onToken(text)              — token of the final answer
+  *   onDone(iterations)         — agent finished
+  *   onError(msg)               — connection or server error
+  */
+ export function streamAgentQuery({ question, repo, onToolCall, onToolResult, onToken, onDone, onError }) {
+   const params = new URLSearchParams({
+     question,
+     ...(repo ? { repo } : {}),
+   });
+
+   const es = new EventSource(`${BASE}/agent/stream?${params}`);
+
+   // Named event: agent is about to call a tool
+   es.addEventListener("tool_call", (e) => {
+     const { tool, input } = JSON.parse(e.data);
+     onToolCall?.(tool, input);
+   });
+
+   // Named event: tool returned a result
+   es.addEventListener("tool_result", (e) => {
+     const { tool, output } = JSON.parse(e.data);
+     onToolResult?.(tool, output);
+   });
+
+   // Named event: agent finished
+   es.addEventListener("done", (e) => {
+     const { iterations } = JSON.parse(e.data);
+     onDone?.(iterations);
+   });
+
+   // Default events: token text (or [DONE] sentinel)
+   es.onmessage = (e) => {
+     if (e.data === "[DONE]") {
+       es.close();
+       return;
+     }
+     const token = e.data.replace(/\\n/g, "\n");
+     onToken?.(token);
+   };
+
+   es.onerror = () => {
+     es.close();
+     onError?.("Agent connection lost");
+   };
+
+   return () => es.close();
+ }
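streamAgentQuery distinguishes named SSE events (tool_call, tool_result, done) from default message events carrying answer tokens. The wire format it relies on can be illustrated with a small parser. This is a hypothetical helper for illustration only (the real client just uses the browser's EventSource), assuming the standard SSE framing of `event:`/`data:` lines separated by blank lines:

```python
import json

# Parse raw SSE text into (event_name, data) pairs.
# Frames are separated by a blank line; "event:" is optional and
# defaults to "message", matching EventSource semantics.
def parse_sse(raw: str) -> list[tuple[str, str]]:
    events = []
    for frame in raw.strip().split("\n\n"):
        name, data = "message", []
        for line in frame.splitlines():
            if line.startswith("event:"):
                name = line[len("event:"):].strip()
            elif line.startswith("data:"):
                data.append(line[len("data:"):].strip())
        events.append((name, "\n".join(data)))
    return events


raw = (
    'event: tool_call\ndata: {"tool": "search_code", "input": {"query": "loss"}}\n\n'
    "data: The answer starts here\n\n"
    'event: done\ndata: {"iterations": 2}\n\n'
)
for name, data in parse_sse(raw):
    print(name, data)
```

Named events map onto `es.addEventListener(name, ...)` while the unnamed frame lands in `es.onmessage`, which is exactly the split the client code above makes.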
ui/src/components/Message.jsx CHANGED
@@ -1,3 +1,4 @@
+ import { useState } from "react";
  import ReactMarkdown from "react-markdown";
  import { Prism as SyntaxHighlighter } from "react-syntax-highlighter";
  import { oneDark } from "react-syntax-highlighter/dist/esm/styles/prism";
@@ -30,6 +31,69 @@ const mdComponents = {
    },
  };

+ // ToolCallTrace shows the agent's reasoning steps.
+ //
+ // DURING streaming: shows steps live, expanded, as they accumulate.
+ // AFTER completion: collapses to a toggle button to keep the UI clean.
+ //
+ // This is the "glass box" view — users can watch the LLM reason in real time,
+ // see what it searched for, and what it found, step by step.
+ function ToolCallTrace({ steps, streaming }) {
+   const [expanded, setExpanded] = useState(true);
+   if (!steps || steps.length === 0) return null;
+
+   // Tool name → emoji for quick visual scanning
+   const toolIcon = { search_code: "🔍", get_file_chunk: "📄", find_callers: "🔗" };
+
+   const stepsEl = (
+     <div className="agent-trace-steps">
+       {steps.map((step, i) => (
+         <div key={i} className={`agent-step ${step.output ? "done" : "pending"}`}>
+           <div className="agent-step-header">
+             <span className="agent-step-icon">{toolIcon[step.tool] || "⚙️"}</span>
+             <span className="agent-step-tool">{step.tool}</span>
+             <span className="agent-step-query">
+               {step.input?.query || step.input?.function_name || JSON.stringify(step.input)}
+             </span>
+             {/* Spinner on the last step while waiting for result */}
+             {!step.output && i === steps.length - 1 && (
+               <span className="spinner" style={{ marginLeft: "auto", flexShrink: 0 }} />
+             )}
+           </div>
+           {step.output && (
+             <div className="agent-step-output">{step.output}</div>
+           )}
+         </div>
+       ))}
+     </div>
+   );
+
+   if (streaming) {
+     // Live view: always expanded while agent is running
+     return (
+       <div className="agent-trace live">
+         <div className="agent-trace-label">
+           ✦ Agent reasoning · {steps.length} step{steps.length !== 1 ? "s" : ""}
+         </div>
+         {stepsEl}
+       </div>
+     );
+   }
+
+   // Collapsible view after completion
+   return (
+     <div className="agent-trace">
+       <button
+         className="agent-trace-toggle"
+         onClick={() => setExpanded((v) => !v)}
+       >
+         {expanded ? "▼" : "▶"} Reasoning trace · {steps.length} step{steps.length !== 1 ? "s" : ""}
+       </button>
+       {expanded && stepsEl}
+     </div>
+   );
+ }
+
  export default function Message({ msg }) {
    const isUser = msg.role === "user";
@@ -39,20 +103,36 @@ export default function Message({ msg }) {
        <div className="bubble">{msg.content}</div>
      ) : (
        <>
+         {/* Agent reasoning trace — live during streaming, collapsible after */}
+         {msg.toolCalls && msg.toolCalls.length > 0 && (
+           <ToolCallTrace steps={msg.toolCalls} streaming={msg.streaming} />
+         )}
+
+         {/* "Thinking…" shown before the first tool call fires */}
+         {msg.streaming && msg.currentTool === null && !msg.content && (!msg.toolCalls || msg.toolCalls.length === 0) && (
+           <div className="agent-thinking">
+             <span className="spinner" />
+             Thinking…
+           </div>
+         )}
+
          {/* Answer bubble */}
          <div className="bubble">
            <ReactMarkdown components={mdComponents}>
              {msg.content || " "}
            </ReactMarkdown>
-           {msg.streaming && <span className="cursor" />}
+           {msg.streaming && !msg.currentTool && <span className="cursor" />}
          </div>

-         {/* Query type badge */}
-         {msg.queryType && !msg.streaming && (
+         {/* Query type badge or agent iterations badge */}
+         {!msg.streaming && msg.iterations && (
+           <span className="query-type-badge">agent · {msg.iterations} step{msg.iterations !== 1 ? "s" : ""}</span>
+         )}
+         {!msg.streaming && msg.queryType && !msg.iterations && (
            <span className="query-type-badge">{msg.queryType}</span>
          )}

-         {/* Sources */}
+         {/* Sources (only for non-agent RAG) */}
          {msg.sources && msg.sources.length > 0 && !msg.streaming && (
            <div className="sources">
              <div className="sources-header">
@@ -1,7 +1,7 @@
 import { useState } from "react";
 import { ingestRepo, deleteRepo } from "../api";

-export default function Sidebar({ repos, activeRepo, onSelectRepo, onReposChange, mode, onModeChange }) {
+export default function Sidebar({ repos, activeRepo, onSelectRepo, onReposChange, mode, onModeChange, agentMode, onAgentModeChange }) {
   const [url, setUrl] = useState("");
   const [status, setStatus] = useState(null); // {type, text}
   const [loading, setLoading] = useState(false);
@@ -61,22 +61,46 @@ export default function Sidebar({ repos, activeRepo, onSelectRepo, onReposChange
         )}
       </div>

-      {/* ── Search mode ── */}
+      {/* ── Query mode (RAG vs Agent) ── */}
       <div>
-        <div className="section-label">Search Mode</div>
+        <div className="section-label">Query Mode</div>
+        {/* Agent mode toggle — switches between plain RAG and agentic ReAct loop */}
         <div className="mode-pills">
-          {["hybrid", "semantic", "keyword"].map((m) => (
-            <button
-              key={m}
-              className={`pill ${mode === m ? "active" : ""}`}
-              onClick={() => onModeChange(m)}
-            >
-              {m}
-            </button>
-          ))}
+          <button
+            className={`pill ${!agentMode ? "active" : ""}`}
+            onClick={() => onAgentModeChange(false)}
+            title="Single retrieval, fast answer"
+          >
+            RAG
+          </button>
+          <button
+            className={`pill ${agentMode ? "active" : ""}`}
+            onClick={() => onAgentModeChange(true)}
+            title="Multi-step reasoning, more thorough"
+          >
+            Agent ✦
+          </button>
         </div>
       </div>

+      {/* ── Search mode (only visible in RAG mode) ── */}
+      {!agentMode && (
+        <div>
+          <div className="section-label">Search Mode</div>
+          <div className="mode-pills">
+            {["hybrid", "semantic", "keyword"].map((m) => (
+              <button
+                key={m}
+                className={`pill ${mode === m ? "active" : ""}`}
+                onClick={() => onModeChange(m)}
+              >
+                {m}
+              </button>
+            ))}
+          </div>
+        </div>
+      )}
+
       {/* ── Repos ── */}
       <div style={{ flex: 1 }}>
         <div className="section-label">Indexed Repos ({repos.length})</div>
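For context on how the UI drives this toggle: the commit message describes `streamAgentQuery()` in api.js emitting `tool_call`, `tool_result`, and `done` SSE events. That function is not part of this diff, so the following is only a hypothetical sketch of the SSE frame parsing it would need, assuming the conventional `event:`/`data:` wire format with JSON payloads:

```javascript
// Parse a raw SSE response body into { event, data } records.
// Assumes "event: <name>\ndata: <json>\n\n" frames, matching the
// tool_call / tool_result / done events named in the commit message.
function parseSseFrames(text) {
  return text
    .split("\n\n")
    .filter((frame) => frame.trim().length > 0)
    .map((frame) => {
      let event = "message"; // SSE default when no event: line is present
      let data = "";
      for (const line of frame.split("\n")) {
        if (line.startsWith("event: ")) event = line.slice(7).trim();
        else if (line.startsWith("data: ")) data += line.slice(6);
      }
      return { event, data: data ? JSON.parse(data) : null };
    });
}
```

A `tool_call` frame parsed this way would carry the tool name and input that the trace UI renders per step; the exact payload shape in the real api.js may differ.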
ui/src/index.css CHANGED
@@ -428,6 +428,112 @@ body {
 }
 @keyframes blink { 0%, 100% { opacity: 1; } 50% { opacity: 0; } }

+/* ── Agent trace ─────────────────────────────────────────────── */
+/* The collapsible "Reasoning trace" block shown above the answer
+   in agent mode — lets users see exactly what the LLM searched. */
+.agent-trace {
+  width: 100%;
+  max-width: 760px;
+  margin-bottom: 8px;
+}
+
+.agent-trace.live {
+  /* Highlighted border when live reasoning is in progress */
+  border: 1px solid var(--accent);
+  border-radius: 8px;
+  padding: 8px 12px;
+  background: var(--accent-dim);
+}
+
+.agent-trace-label {
+  font-size: 11px;
+  font-weight: 600;
+  color: var(--accent);
+  text-transform: uppercase;
+  letter-spacing: 0.06em;
+  margin-bottom: 8px;
+}
+
+.agent-trace-toggle {
+  background: none;
+  border: 1px solid var(--border);
+  border-radius: 6px;
+  color: var(--muted);
+  cursor: pointer;
+  font-size: 11px;
+  font-family: inherit;
+  padding: 4px 10px;
+  transition: border-color 0.15s, color 0.15s;
+}
+.agent-trace-toggle:hover { border-color: var(--accent); color: var(--accent); }
+
+.agent-trace-steps {
+  margin-top: 6px;
+  display: flex;
+  flex-direction: column;
+  gap: 4px;
+}
+
+.agent-step {
+  background: var(--surface);
+  border: 1px solid var(--border);
+  border-radius: 6px;
+  padding: 7px 12px;
+  font-size: 12px;
+  transition: border-color 0.2s;
+}
+.agent-step.pending { border-color: var(--accent); }
+.agent-step.done { opacity: 0.85; }
+
+.agent-step-header {
+  display: flex;
+  align-items: center;
+  gap: 6px;
+}
+
+.agent-step-icon { font-size: 13px; }
+
+.agent-step-tool {
+  font-family: "JetBrains Mono", monospace;
+  font-size: 11px;
+  font-weight: 600;
+  color: var(--accent);
+  white-space: nowrap;
+}
+
+.agent-step-query {
+  color: var(--text);
+  font-size: 12px;
+  white-space: nowrap;
+  overflow: hidden;
+  text-overflow: ellipsis;
+  flex: 1;
+}
+
+.agent-step-output {
+  margin-top: 5px;
+  color: var(--muted);
+  font-size: 11px;
+  font-family: "JetBrains Mono", monospace;
+  white-space: pre-wrap;
+  word-break: break-word;
+  max-height: 80px;
+  overflow: hidden;
+  mask-image: linear-gradient(to bottom, black 60%, transparent 100%);
+}
+
+/* Live "agent is thinking" indicator shown while tool calls are in flight */
+.agent-thinking {
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  font-size: 12px;
+  color: var(--muted);
+  margin-bottom: 8px;
+  width: 100%;
+  max-width: 760px;
+}
+
 /* ── Scrollbar ───────────────────────────────────────────────── */
 ::-webkit-scrollbar { width: 6px; }
 ::-webkit-scrollbar-track { background: transparent; }
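The `.agent-step.pending` / `.agent-step.done` classes above imply a small state machine in the trace component: a `tool_call` event appends a pending step, and the matching `tool_result` flips it to done. The actual React logic is not in this diff; a hypothetical reducer-style sketch of that state transition:

```javascript
// Hypothetical reducer for the live agent trace. Each tool_call appends a
// step in the "pending" state (rendered with the accent border); the next
// tool_result marks the oldest pending step "done" and attaches its output.
function traceReducer(steps, event) {
  switch (event.type) {
    case "tool_call":
      return [...steps, { tool: event.tool, input: event.input, status: "pending" }];
    case "tool_result": {
      const next = steps.slice();
      const i = next.findIndex((s) => s.status === "pending");
      if (i !== -1) next[i] = { ...next[i], output: event.output, status: "done" };
      return next;
    }
    default:
      return steps; // ignore token/done events here
  }
}
```

Keeping the transitions pure like this would make the spinner-to-done flip trivial to test, though the component in this commit may well inline the same logic in its SSE callbacks instead.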