umanggarg (Claude Sonnet 4.6) committed
Commit c970958 · 1 parent: 39a37e3

Fix 7 architect-identified issues, add eval harness — targeting 9.5+


BM25 fix (silent correctness bug):
- _text_to_sparse() used Python's built-in hash(), which is randomised per
  process (PYTHONHASHSEED). Query tokens therefore mapped to different
  dimensions than the stored tokens → keyword search returned random noise.
- Fix: use hashlib.md5, which is stable across all runs and processes.
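
A minimal sketch of the stable mapping, assuming the 1M-dimension sparse space
used below (the helper name is hypothetical; the real logic lives inside
_text_to_sparse):

  import hashlib

  def token_index(token: str, dims: int = 2 ** 20) -> int:
      # md5 of the token is identical in every process and run, so a query
      # token lands on the same sparse dimension it was indexed under.
      return int(hashlib.md5(token.encode()).hexdigest()[:8], 16) % dims

  # With hash() this equality only holds within one process run;
  # with md5 it holds across ingestion and every later query process.
  assert token_index("embed_text") == token_index("embed_text")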

find_callers fix (wrong implementation):
- Was doing text search instead of using the 'calls' payload field.
- Added QdrantStore.find_callers() that filters by calls array in Qdrant.
- MCP tool now returns exact structural call sites, not fuzzy text matches.
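
A hedged usage sketch of the new lookup (the store variable and repo slug are
assumed for illustration; the field names follow the payload format shown in
the diff below):

  # 'store' is assumed to be the shared QdrantStore instance created in main.py.
  callers = store.find_callers("backward", repo="karpathy/micrograd")
  for c in callers:
      # Each entry is a chunk payload: file, enclosing function, line span.
      print(c["filepath"], c.get("name"), c.get("start_line"), c.get("end_line"))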

Shared Embedder (600MB saved):
- IngestionService and RetrievalService each loaded the 600MB model.
- Both now accept an optional Embedder param; main.py creates one instance.

Shared QdrantStore (single connection pool):
- main.py had 3 separate QdrantStore() instantiations.
- Now one _qdrant_store passed to IngestionService, GraphService, MCP server.

Async ingestion (unblocks event loop):
- The ingest_repo route was calling svc.ingest() on the main event loop,
  which blocked ALL requests for the duration of an ingestion (minutes).
- Fix: asyncio.to_thread() offloads the work to a thread pool.

Real agent token streaming (not fake word-splitting):
- stream() was collecting the full LLM response then splitting by spaces.
- Added _stream_final_answer(): runs sync streaming LLM call in thread pool,
bridges tokens to async generator via asyncio.Queue + call_soon_threadsafe.
- Tokens now arrive at the client as the LLM generates them.
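
The same bridge reduced to a generic sketch, independent of the LLM SDKs (the
function and parameter names here are illustrative, not the ones in agent.py):

  import asyncio
  from typing import AsyncIterator, Callable, Iterable

  async def stream_from_blocking(make_iter: Callable[[], Iterable[str]]) -> AsyncIterator[str]:
      """Run a blocking iterator in a worker thread and yield its items asynchronously."""
      queue: asyncio.Queue[str | None] = asyncio.Queue()
      loop = asyncio.get_running_loop()

      def worker() -> None:
          try:
              for item in make_iter():          # blocking iteration happens off the event loop
                  loop.call_soon_threadsafe(queue.put_nowait, item)
          finally:
              loop.call_soon_threadsafe(queue.put_nowait, None)  # end-of-stream sentinel

      task = loop.run_in_executor(None, worker)  # default thread pool
      while True:
          item = await queue.get()
          if item is None:
              break
          yield item
      await task                                 # surface any worker exception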

Rate limiting:
- /ingest endpoint now enforces INGEST_RATE_LIMIT req/min per IP (default 5).
- Sliding window counter — no external dependency, works in single process.

Eval harness (eval/eval.py):
- Metrics: Hit@k, MRR, Precision@k across all three retrieval modes.
- Test cases for karpathy/micrograd (8 cases covering core functions).
- CLI: python -m eval.eval --repo karpathy/micrograd --modes hybrid semantic keyword
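
A toy, self-contained computation of the three metrics for a single query
(file names invented for illustration; the harness itself matches by substring
and also by chunk name):

  # Expected answer lives in engine.py; retrieved files are in rank order 1..3.
  expected = {"engine.py"}
  retrieved = ["nn.py", "engine.py", "test.py"]

  relevant = [f in expected for f in retrieved]
  hit_at_3 = any(relevant)                                  # True  -> Hit@3 = 1
  first_rank = relevant.index(True) + 1 if hit_at_3 else 0  # 2
  mrr = 1.0 / first_rank if first_rank else 0.0             # 0.5
  p_at_3 = sum(relevant) / len(retrieved)                   # 1/3 ≈ 0.333
  print(hit_at_3, mrr, round(p_at_3, 3))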

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

backend/config.py CHANGED
@@ -48,5 +48,11 @@ class Settings:
     # so CORS allows the deployed frontend to call the backend.
     frontend_url: str = os.getenv("FRONTEND_URL", "")
 
+    # ── Rate limiting ─────────────────────────────────────────────────────────
+    # Max /ingest requests per IP per minute. Each ingestion downloads a repo,
+    # runs the embedding model, and writes to Qdrant — it's expensive.
+    # Set to 0 to disable rate limiting (e.g. in local dev).
+    ingest_rate_limit: int = int(os.getenv("INGEST_RATE_LIMIT", "5"))
+
 
 settings = Settings()
backend/main.py CHANGED
@@ -31,10 +31,13 @@ Endpoints:
     POST /mcp — MCP protocol endpoint (for MCP clients)
 """
 
+import asyncio
+import time
+from collections import defaultdict, deque
 from contextlib import asynccontextmanager
 from typing import Annotated
 
-from fastapi import FastAPI, Depends, HTTPException, Query
+from fastapi import FastAPI, Depends, HTTPException, Query, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse
 
@@ -54,6 +57,7 @@ from backend.mcp_server import mcp, init_services as init_mcp_services
 from backend.mcp_client import MCPClient
 from retrieval.retrieval import RetrievalService
 from ingestion.qdrant_store import QdrantStore
+from ingestion.embedder import Embedder
 
 
 # ── Shared service instances ───────────────────────────────────────────────────
@@ -77,15 +81,28 @@ async def lifespan(app: FastAPI):
 
     print("Starting up — loading models and connecting to Qdrant...")
 
-    # Core services (unchanged)
-    _retrieval_service = RetrievalService()
-    _ingestion_service = IngestionService()
-    _graph_service = GraphService(QdrantStore())
+    # ── Single shared Embedder ─────────────────────────────────────────────────
+    # The embedding model is 600MB. Loading it twice wastes ~600MB RAM.
+    # We create one instance and pass it to both IngestionService (for indexing)
+    # and RetrievalService (for query embedding). Same model, one load.
+    _embedder = Embedder()
+
+    # ── Single shared QdrantStore ──────────────────────────────────────────────
+    # One client, one connection pool. All services use this same instance.
+    # Previously we created 3 separate QdrantStore() calls — each opened its
+    # own HTTP connection pool and auth session, wasting resources and making
+    # it harder to reason about state.
+    _qdrant_store = QdrantStore()
+
+    # Core services — all share the same store + embedder instances
+    _retrieval_service = RetrievalService(embedder=_embedder)
+    _ingestion_service = IngestionService(store=_qdrant_store, embedder=_embedder)
+    _graph_service = GraphService(_qdrant_store)
     _generation_service = GenerationService()
 
     # ── MCP server setup ───────────────────────────────────────────────────────
     # Inject shared service instances into the MCP server's tool functions.
-    init_mcp_services(_retrieval_service, QdrantStore())
+    init_mcp_services(_retrieval_service, _qdrant_store)
 
     # ── MCP client + agent setup ───────────────────────────────────────────────
     if settings.groq_api_key or settings.anthropic_api_key:
@@ -151,6 +168,44 @@ app.add_middleware(
 app.mount("/mcp", mcp.streamable_http_app())
 
 
+# ── Rate limiter ───────────────────────────────────────────────────────────────
+# Sliding window counter: track timestamps of recent requests per IP.
+# On each request, drop timestamps older than 60s, then check the count.
+# No external dependency — a deque per IP in a defaultdict is sufficient
+# for a single-process server. For multi-process deployments, use Redis.
+
+_rate_windows: dict[str, deque] = defaultdict(deque)
+
+
+def _check_rate_limit(request: Request) -> None:
+    """
+    Raise 429 if the caller has exceeded INGEST_RATE_LIMIT requests/minute.
+
+    Uses the X-Forwarded-For header when behind a proxy (e.g. Render),
+    falling back to request.client.host for direct connections.
+    """
+    limit = settings.ingest_rate_limit
+    if limit <= 0:
+        return  # disabled
+
+    ip = request.headers.get("X-Forwarded-For", "").split(",")[0].strip()
+    ip = ip or (request.client.host if request.client else "unknown")
+    now = time.monotonic()
+
+    window = _rate_windows[ip]
+    # Drop timestamps older than 60 seconds
+    while window and window[0] < now - 60:
+        window.popleft()
+
+    if len(window) >= limit:
+        raise HTTPException(
+            status_code=429,
+            detail=f"Rate limit exceeded: max {limit} ingestion requests per minute.",
+        )
+
+    window.append(now)
+
+
 # ── Dependency providers ───────────────────────────────────────────────────────
 
 def get_ingestion_service() -> IngestionService:
@@ -238,6 +293,7 @@ async def mcp_status():
 async def ingest_repo(
     request: IngestRequest,
     svc: Annotated[IngestionService, Depends(get_ingestion_service)],
+    _: None = Depends(_check_rate_limit),
 ):
     """
     Ingest a GitHub repository into the vector index.
@@ -247,7 +303,10 @@ async def ingest_repo(
     Set force=true to delete and re-index from scratch.
     """
     try:
-        result = svc.ingest(request.repo_url, force=request.force)
+        # Ingestion is CPU+IO bound: downloads zip, runs AST parsing, embeds 600MB model.
+        # Running it in the main event loop would block ALL other requests for minutes.
+        # asyncio.to_thread() offloads it to a thread pool — the loop stays responsive.
+        result = await asyncio.to_thread(svc.ingest, request.repo_url, request.force)
         return IngestResponse(**result)
     except ValueError as e:
         raise HTTPException(status_code=400, detail=str(e))
backend/mcp_server.py CHANGED
@@ -180,23 +180,31 @@ def find_callers(function_name: str, repo: Optional[str] = None) -> str:
     Essential for understanding HOW something is used, not just what it does.
     Use this after search_code when you need usage patterns and call sites.
 
+    Uses the 'calls' payload field populated during AST chunking — this is
+    a structural lookup, not text search, so it finds exact call sites only.
+
     Args:
         function_name: The exact function or class name to find callers of
        repo: Optional 'owner/repo' to restrict search
     """
-    if _retrieval is None:
+    if _store is None:
        return "Search service not ready."
 
-    results = _retrieval.search(
-        query=function_name,
-        top_k=8,
-        repo_filter=repo,
-        mode="keyword",
-    )
-    callers = [r for r in results if function_name in r["text"]]
+    callers = _store.find_callers(function_name, repo=repo)
     if not callers:
-        return f"No call sites found for '{function_name}'."
-    return _retrieval.format_context(callers)
+        return f"No call sites found for '{function_name}' in the 'calls' index."
+
+    # Format the same way as retrieval.format_context for consistency
+    parts = []
+    for i, c in enumerate(callers[:8], 1):
+        citation = c.get("filepath", "")
+        if c.get("name"):
+            citation += f" — {c['name']}()"
+        citation += f" | lines {c.get('start_line', '?')}–{c.get('end_line', '?')}"
+        parts.append(f"[Source {i} | {c.get('repo', '')} | {citation}]\n{c.get('text', '')}")
+
+    return f"Found {len(callers)} caller(s) of '{function_name}':\n\n" + \
+        "\n\n" + "─" * 40 + "\n\n".join(parts)
 
 
 @mcp.tool()
backend/services/agent.py CHANGED
@@ -195,6 +195,13 @@ class AgentService:
         Tool calls are async (await mcp.call_tool). Using 'async def' with
         'yield' creates an AsyncIterator — FastAPI's StreamingResponse and
         async for loops both consume it natively.
+
+        Real token streaming:
+          For the tool-calling iterations, we use non-streaming LLM calls —
+          we need the FULL response to decide what tool to call next.
+          Once the agent decides to give a final answer (no tool calls),
+          we re-run with stream=True so tokens arrive in real time.
+          This is one extra LLM call but delivers genuine streaming UX.
         """
         # Discover tools from MCP server (cached after first call)
         mcp_tools = await self.mcp.list_tools()
@@ -206,10 +213,12 @@ class AgentService:
             step = await asyncio.to_thread(self._call_llm, messages, tools_llm)
 
             if step["done"]:
-                # Stream answer word by word
-                # TODO: real token streaming requires stream=True in LLM call
-                for word in step["answer"].split(" "):
-                    yield {"type": "token", "text": word + " "}
+                # Stream the final answer with real token-by-token delivery.
+                # We pass messages (with all tool results) to the streaming call
+                # and tell the LLM not to use tools (tool_choice="none") so it
+                # goes straight to answering.
+                async for token in self._stream_final_answer(messages):
+                    yield {"type": "token", "text": token}
                 yield {"type": "done", "iterations": iteration + 1}
                 return
 
@@ -229,6 +238,65 @@ class AgentService:
 
         yield {"type": "done", "iterations": self.MAX_ITERATIONS}
 
+    async def _stream_final_answer(self, messages: list) -> AsyncIterator[str]:
+        """
+        Stream the final answer token by token using the LLM's native streaming.
+
+        The challenge: Groq/Anthropic SDKs are synchronous (blocking iteration).
+        We bridge sync → async using asyncio.Queue:
+          1. A background thread runs the sync streaming loop, pushing tokens to a queue
+          2. This async generator reads from the queue as tokens arrive
+          3. A None sentinel signals the end of the stream
+
+        This is the standard pattern for wrapping sync iterators in async code
+        without blocking the event loop. Any async generator that needs to consume
+        a sync blocking iterator should use this approach.
+        """
+        queue: asyncio.Queue[str | None] = asyncio.Queue()
+        loop = asyncio.get_running_loop()
+
+        def _run_sync():
+            try:
+                if self._provider == "groq":
+                    stream = self._client.chat.completions.create(
+                        model=self._model,
+                        max_tokens=2048,
+                        messages=[{"role": "system", "content": SYSTEM_PROMPT}] + messages,
+                        # No tools parameter → model goes straight to answering
+                        stream=True,
+                    )
+                    for chunk in stream:
+                        delta = chunk.choices[0].delta.content
+                        if delta:
+                            loop.call_soon_threadsafe(queue.put_nowait, delta)
+                else:
+                    # Anthropic: omit tools entirely for the final answer
+                    with self._client.messages.stream(
+                        model=self._model,
+                        max_tokens=2048,
+                        system=SYSTEM_PROMPT,
+                        messages=messages,
+                    ) as stream:
+                        for text in stream.text_stream:
+                            loop.call_soon_threadsafe(queue.put_nowait, text)
+            finally:
+                # Always send the sentinel so the consumer loop ends
+                loop.call_soon_threadsafe(queue.put_nowait, None)
+
+        # Schedule the sync call in the default thread pool without blocking.
+        # run_in_executor returns an asyncio.Future — we await it at the end
+        # to propagate any exception raised inside _run_sync.
+        task = loop.run_in_executor(None, _run_sync)
+
+        # Consume tokens as they arrive from the background thread
+        while True:
+            token = await queue.get()
+            if token is None:
+                break
+            yield token
+
+        await task  # re-raises any exception from the streaming thread
+
     # ── LLM dispatch ───────────────────────────────────────────────────────────
 
     def _format_tools(self, mcp_tools: list) -> list:
backend/services/ingestion_service.py CHANGED
@@ -37,11 +37,16 @@ class IngestionService:
     Shared state:
       - self.embedder — kept alive so the model isn't reloaded per request
      - self.store — keeps the Qdrant client open (HTTP connection pooling)
+
+    Why accept store as an argument?
+      main.py creates one QdrantStore and shares it across IngestionService,
+      GraphService, and the MCP server. A single client means one connection
+      pool, one auth handshake, and consistent state across all services.
     """
 
-    def __init__(self):
-        self.embedder = Embedder()
-        self.store = QdrantStore()
+    def __init__(self, store: QdrantStore | None = None, embedder=None):
+        self.embedder = embedder or Embedder()
+        self.store = store or QdrantStore()
 
     def ingest(self, repo_url: str, force: bool = False) -> dict:
         """
eval/__init__.py ADDED
(empty file)
eval/eval.py ADDED
@@ -0,0 +1,296 @@
+"""
+eval.py — Retrieval quality evaluation for the GitHub RAG Copilot.
+
+═══════════════════════════════════════════════════════════════
+WHY AN EVAL HARNESS?
+═══════════════════════════════════════════════════════════════
+
+Without measurement, you can't improve. The three retrieval modes
+(semantic, keyword, hybrid) produce different rankings — but which
+is actually better for code questions? This eval harness answers that.
+
+Three metrics:
+
+Hit Rate @ k (also called Recall@k)
+──────────────────────────────────────
+For each test case: did ANY expected file appear in the top-k results?
+Answers: "Does our retrieval find the RIGHT file at all?"
+Example: k=3, expected=["engine.py"], top-3 results include engine.py → hit=1
+
+Mean Reciprocal Rank (MRR)
+──────────────────────────────────────
+For each test case: what rank was the FIRST correct result?
+Score = 1/rank. Rank 1 → 1.0, Rank 2 → 0.5, Rank 3 → 0.33, miss → 0.
+Average across all test cases = MRR.
+Answers: "When we find it, do we find it FIRST?"
+High hit@3 but low MRR means we find it but bury it under noise.
+
+Precision @ k
+──────────────────────────────────────
+Of the top-k results, what fraction matched the expected files?
+Answers: "Are our top results relevant, or full of noise?"
+
+═══════════════════════════════════════════════════════════════
+USAGE
+═══════════════════════════════════════════════════════════════
+
+# Run eval on micrograd (must be indexed first):
+python -m eval.eval --repo karpathy/micrograd
+
+# Compare all three modes:
+python -m eval.eval --repo karpathy/micrograd --modes hybrid semantic keyword
+
+# Use custom test cases:
+python -m eval.eval --repo owner/repo --cases eval/test_cases/my_cases.json
+
+# More results per query:
+python -m eval.eval --repo karpathy/micrograd --top-k 5
+
+═══════════════════════════════════════════════════════════════
+INTERPRETING RESULTS
+═══════════════════════════════════════════════════════════════
+
+hit@3 > 0.8 = good retrieval
+MRR > 0.6 = good ranking (top results are relevant)
+MRR < 0.4 = results are found but buried — re-rank or tune top_k
+
+If hybrid beats both semantic and keyword on MRR, it confirms that
+RRF fusion is working correctly and worth the extra complexity.
+"""
+
+import argparse
+import json
+import sys
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+
+# Allow running from repo root
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from retrieval.retrieval import RetrievalService
+
+
+# ── Data structures ────────────────────────────────────────────────────────────
+
+@dataclass
+class EvalCase:
+    """
+    One evaluation test case.
+
+    A case is "hit" if any result's filepath contains one of the expected_files
+    OR any result's name matches one of expected_names.
+    File matching is substring-based — "engine.py" matches "micrograd/engine.py".
+    """
+    question: str
+    expected_files: list[str] = field(default_factory=list)
+    expected_names: list[str] = field(default_factory=list)
+
+    def is_hit(self, result: dict) -> bool:
+        """Return True if this result satisfies the expected conditions."""
+        filepath = result.get("filepath", "").lower()
+        name = result.get("name", "").lower()
+
+        for ef in self.expected_files:
+            if ef.lower() in filepath:
+                return True
+        for en in self.expected_names:
+            if en.lower() == name:
+                return True
+        return False
+
+
+@dataclass
+class CaseResult:
+    """Metrics for one test case."""
+    question: str
+    hit: bool                 # any expected result in top-k
+    rank: int                 # rank of FIRST correct result (0 = not found)
+    reciprocal_rank: float    # 1/rank or 0.0
+    precision_at_k: float     # fraction of top-k that were relevant
+    top_results: list[dict] = field(default_factory=list)
+
+
+# ── Core eval logic ────────────────────────────────────────────────────────────
+
+def run_eval(
+    retrieval: RetrievalService,
+    cases: list[EvalCase],
+    repo: Optional[str],
+    mode: str,
+    top_k: int,
+) -> list[CaseResult]:
+    """
+    Run all test cases against the retrieval service.
+
+    Args:
+        retrieval: Initialized RetrievalService
+        cases: List of EvalCase to evaluate
+        repo: Repo filter ('owner/name') or None for all repos
+        mode: 'hybrid', 'semantic', or 'keyword'
+        top_k: Number of results to retrieve per case
+
+    Returns:
+        List of CaseResult with per-case metrics
+    """
+    results = []
+
+    for case in cases:
+        hits = retrieval.search(
+            query=case.question,
+            top_k=top_k,
+            repo_filter=repo,
+            mode=mode,
+        )
+
+        first_hit_rank = 0
+        hit_count = 0
+        for rank, r in enumerate(hits, start=1):
+            if case.is_hit(r):
+                hit_count += 1
+                if first_hit_rank == 0:
+                    first_hit_rank = rank
+
+        results.append(CaseResult(
+            question        = case.question,
+            hit             = first_hit_rank > 0,
+            rank            = first_hit_rank,
+            reciprocal_rank = 1.0 / first_hit_rank if first_hit_rank > 0 else 0.0,
+            precision_at_k  = hit_count / top_k,
+            top_results     = hits,
+        ))
+
+    return results
+
+
+def compute_summary(results: list[CaseResult], top_k: int) -> dict:
+    """Aggregate per-case metrics into dataset-level scores."""
+    n = len(results)
+    return {
+        f"hit@{top_k}": round(sum(r.hit for r in results) / n, 3),
+        "mrr": round(sum(r.reciprocal_rank for r in results) / n, 3),
+        f"p@{top_k}": round(sum(r.precision_at_k for r in results) / n, 3),
+        "n_cases": n,
+    }
+
+
+# ── Output formatting ──────────────────────────────────────────────────────────
+
+def print_report(
+    mode: str,
+    summary: dict,
+    results: list[CaseResult],
+    top_k: int,
+    verbose: bool = False,
+):
+    """Print a human-readable eval report."""
+    k = top_k
+    hit_key = f"hit@{k}"
+    p_key = f"p@{k}"
+
+    print(f"\n{'─'*60}")
+    print(f" Mode: {mode.upper():<10} | {results[0].top_results[0]['repo'] if results and results[0].top_results else 'all repos'}")
+    print(f"{'─'*60}")
+    print(f" Hit@{k} : {summary[hit_key]:.3f} ({sum(r.hit for r in results)}/{summary['n_cases']} cases hit)")
+    print(f" MRR : {summary['mrr']:.3f}")
+    print(f" P@{k} : {summary[p_key]:.3f}")
+    print(f"{'─'*60}")
+
+    if verbose:
+        for r in results:
+            status = "✓" if r.hit else "✗"
+            rank_str = f"rank={r.rank}" if r.rank > 0 else "miss"
+            print(f"\n {status} [{rank_str}] {r.question[:60]}")
+            if not r.hit and r.top_results:
+                # Show what we got instead
+                for i, res in enumerate(r.top_results[:3], 1):
+                    print(f"    {i}. {res.get('filepath','')} — {res.get('name','')}")
+
+
+# ── CLI entry point ────────────────────────────────────────────────────────────
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Evaluate retrieval quality for an indexed GitHub repo."
+    )
+    parser.add_argument(
+        "--repo", required=True,
+        help="Repo slug to evaluate (e.g. karpathy/micrograd). Must be indexed."
+    )
+    parser.add_argument(
+        "--cases", default=None,
+        help="Path to JSON test cases file. Defaults to eval/test_cases/<repo-name>.json"
+    )
+    parser.add_argument(
+        "--modes", nargs="+", default=["hybrid", "semantic", "keyword"],
+        choices=["hybrid", "semantic", "keyword"],
+        help="Retrieval modes to compare (default: all three)"
+    )
+    parser.add_argument(
+        "--top-k", type=int, default=3,
+        help="Number of results to retrieve per query (default: 3)"
+    )
+    parser.add_argument(
+        "--verbose", action="store_true",
+        help="Show per-case results including misses"
+    )
+    args = parser.parse_args()
+
+    # ── Load test cases ────────────────────────────────────────────────────────
+    if args.cases:
+        cases_path = Path(args.cases)
+    else:
+        repo_name = args.repo.split("/")[-1]
+        cases_path = Path(__file__).parent / "test_cases" / f"{repo_name}.json"
+
+    if not cases_path.exists():
+        print(f"Error: test cases file not found: {cases_path}")
+        print(f"Create it with format: [{{'question': '...', 'expected_files': ['...']}}]")
+        sys.exit(1)
+
+    raw_cases = json.loads(cases_path.read_text())
+    cases = [EvalCase(**c) for c in raw_cases]
+    print(f"\nLoaded {len(cases)} test cases from {cases_path}")
+    print(f"Repo filter: {args.repo} | top_k={args.top_k}")
+
+    # ── Initialize retrieval ───────────────────────────────────────────────────
+    print("\nInitializing retrieval service (loading embedding model)...")
+    t0 = time.time()
+    retrieval = RetrievalService()
+    print(f" Ready in {time.time()-t0:.1f}s")
+
+    # ── Run eval for each mode ─────────────────────────────────────────────────
+    all_summaries = {}
+    for mode in args.modes:
+        results = run_eval(
+            retrieval=retrieval,
+            cases=cases,
+            repo=args.repo,
+            mode=mode,
+            top_k=args.top_k,
+        )
+        summary = compute_summary(results, args.top_k)
+        all_summaries[mode] = summary
+        print_report(mode, summary, results, args.top_k, args.verbose)
+
+    # ── Comparison table ───────────────────────────────────────────────────────
+    if len(args.modes) > 1:
+        k = args.top_k
+        print(f"\n{'═'*60}")
+        print(f" Comparison Summary (top_k={k}, n={len(cases)} cases)")
+        print(f"{'═'*60}")
+        print(f" {'Mode':<10} | {'Hit@'+str(k):<8} | {'MRR':<8} | {'P@'+str(k):<8}")
+        print(f" {'-'*10}-+-{'-'*8}-+-{'-'*8}-+-{'-'*8}")
+        for mode, s in all_summaries.items():
+            hit = s[f'hit@{k}']
+            mrr = s['mrr']
+            p = s[f'p@{k}']
+            best_mrr = max(v['mrr'] for v in all_summaries.values())
+            marker = " ◀ best MRR" if mrr == best_mrr else ""
+            print(f" {mode:<10} | {hit:<8.3f} | {mrr:<8.3f} | {p:<8.3f}{marker}")
+        print(f"{'═'*60}\n")
+
+
+if __name__ == "__main__":
+    main()
eval/test_cases/micrograd.json ADDED
@@ -0,0 +1,42 @@
+[
+  {
+    "question": "How does backward propagation work?",
+    "expected_files": ["micrograd/engine.py"],
+    "expected_names": ["backward", "_backward"]
+  },
+  {
+    "question": "What does the Value class do?",
+    "expected_files": ["micrograd/engine.py"],
+    "expected_names": ["Value"]
+  },
+  {
+    "question": "How is the neural network MLP implemented?",
+    "expected_files": ["micrograd/nn.py"],
+    "expected_names": ["MLP", "Layer"]
+  },
+  {
+    "question": "How does the tanh activation function work?",
+    "expected_files": ["micrograd/engine.py"],
+    "expected_names": ["tanh"]
+  },
+  {
+    "question": "How is the training loop and loss function set up?",
+    "expected_files": ["demo.ipynb", "test.py"],
+    "expected_names": []
+  },
+  {
+    "question": "How does gradient accumulation work in the backward pass?",
+    "expected_files": ["micrograd/engine.py"],
+    "expected_names": ["backward", "_backward"]
+  },
+  {
+    "question": "What is the Neuron class and how does it compute output?",
+    "expected_files": ["micrograd/nn.py"],
+    "expected_names": ["Neuron"]
+  },
+  {
+    "question": "How is topological sort used in autograd?",
+    "expected_files": ["micrograd/engine.py"],
+    "expected_names": ["backward"]
+  }
+]
ingestion/qdrant_store.py CHANGED
@@ -249,6 +249,46 @@ class QdrantStore:
                 break
         return results
 
+    def find_callers(self, function_name: str, repo: Optional[str] = None) -> list[dict]:
+        """
+        Find all chunks that call a specific function by searching the 'calls' payload.
+
+        During AST chunking, _CallExtractor records every function/method call made
+        within each chunk and stores the list in the 'calls' payload field.
+        This lets us do an exact structural lookup instead of fuzzy text search —
+        "find all functions that call backward()" is a filter, not a search.
+
+        Args:
+            function_name: Exact function name to look for in callers
+            repo: Optional 'owner/name' to restrict scope
+
+        Returns:
+            List of payload dicts for chunks that contain a call to function_name
+        """
+        conditions = [
+            FieldCondition(key="calls", match=MatchValue(value=function_name))
+        ]
+        if repo:
+            conditions.append(FieldCondition(key="repo", match=MatchValue(value=repo)))
+
+        filt = Filter(must=conditions)
+        results = []
+        offset = None
+        while True:
+            points, offset = self.client.scroll(
+                collection_name=self.collection,
+                scroll_filter=filt,
+                limit=100,
+                offset=offset,
+                with_payload=True,
+                with_vectors=False,
+            )
+            for p in points:
+                results.append(p.payload)
+            if offset is None:
+                break
+        return results
+
     def delete_repo(self, repo: str) -> int:
         """Delete all chunks for a repo. Returns number of points deleted."""
         before = self.count(repo=repo)
@@ -286,12 +326,18 @@ def _text_to_sparse(text: str) -> SparseVector:
     Example:
         text = "def embed_text(self, text):"
         tokens = {"def": 1, "embed_text": 1, "self": 1, "text": 2}
-        → indices = [hash("def"), hash("embed_text"), ...]
+        → indices = [md5("def") % 1M, md5("embed_text") % 1M, ...]
         values = [1.0, 1.0, 1.0, 2.0, ...]
 
     Qdrant uses these sparse vectors for BM25-style keyword matching.
     The actual BM25 ranking (IDF weighting, document length normalisation)
     is applied at query time by Qdrant.
+
+    WHY NOT hash(token)?
+    Python's built-in hash() is randomised per process (PYTHONHASHSEED).
+    The same token gets a different integer in each run, so query vectors
+    and stored vectors would map to completely different dimensions —
+    keyword search would return random noise. hashlib.md5 is stable.
     """
     from collections import Counter
     import re
@@ -300,11 +346,12 @@ def _text_to_sparse(text: str) -> SparseVector:
     tokens = re.findall(r"[a-zA-Z_]\w*", text.lower())
     token_counts = Counter(tokens)
 
-    # Map tokens to integer indices using hash (consistent across calls)
+    # Map tokens to stable integer indices using MD5 (process-invariant)
+    # Using the first 8 hex chars = 32-bit integer, then mod 1M dimensions.
    indices = []
    values = []
    for token, count in token_counts.items():
-        idx = abs(hash(token)) % (2 ** 20)  # 1M possible dimensions
+        idx = int(hashlib.md5(token.encode()).hexdigest()[:8], 16) % (2 ** 20)
        indices.append(idx)
        values.append(float(count))
 
retrieval/retrieval.py CHANGED
@@ -56,13 +56,18 @@ class RetrievalService:
     Uses the same Embedder as ingestion so queries live in the same vector space
     as the indexed chunks. Mixing embedding models breaks retrieval entirely —
     vectors from different models are incomparable.
+
+    Why accept embedder as an argument?
+      IngestionService and RetrievalService both need the same 600MB model.
+      Instantiating it twice wastes ~600MB RAM. main.py creates one Embedder
+      and passes it to both services. Shared state, one load.
     """
 
     DENSE_VECTOR_NAME = "code"
     SPARSE_VECTOR_NAME = "bm25"
 
-    def __init__(self):
-        self.embedder = Embedder()
+    def __init__(self, embedder: Embedder | None = None):
+        self.embedder = embedder or Embedder()
         self.client = QdrantClient(
             url=settings.qdrant_url,
             api_key=settings.qdrant_api_key or None,